# -*- coding: utf-8 -*-
# get_article3.py
import bisect
import json
import re
from concurrent.futures import wait
from random import randint, shuffle

import requests
from cachetools import TTLCache
from pydantic import BaseModel

from common.common_data import all_exchange_words
from common.split_text import split_text_to_word
from deepseek.ds_api import DS
from tools.loglog import logger, log_err_e
from tools.new_mysql import MySQLUploader
from tools.thread_pool_manager import pool_executor
  16. def get_article_difficulty(article) -> int:
  17. """获取文章的难度值"""
  18. url = "http://qbank.yunzhixue.cn/api/article/analysis"
  19. data = {"body": article, "question": ""}
  20. try:
  21. response = requests.post(url, json=data)
  22. except Exception as e:
  23. log_err_e(e, msg="获取文章难度值;")
  24. return 0
  25. if response.status_code == 200:
  26. difficult_value = response.json()['data']['difficult']
  27. return difficult_value
  28. else:
  29. logger.error(f"错误状态码{response.status_code}")
  30. def find_interval(number):
  31. """
  32. 判断一个数字属于哪个难度等级区间。31级是例外情况,需要排查
  33. :param number: 要检查的数字。
  34. :return: 返回包含该数字的区间,如果没有找到,则返回 None。
  35. """
  36. intervals = [(1, 200), (201, 250), (251, 300), (301, 350), (351, 400), (401, 450), (451, 550), (551, 650), (651, 750), (751, 850),
  37. (851, 950),
  38. (951, 1100),
  39. (1101, 1250), (1251, 1400), (1401, 1550), (1551, 1700), (1701, 1900), (1901, 2100), (2101, 2300), (2301, 2600),
  40. (2601, 2900),
  41. (2901, 3200),
  42. (3201, 3500), (3501, 3900), (3901, 4300), (4301, 4700), (4701, 5100), (5101, 5500), (5501, 5900), (5901, 6500),
  43. (6501, 99999)]
  44. for index, (start, end) in enumerate(intervals, start=1):
  45. if start <= number <= end:
  46. return index
  47. logger.error(f"文章难度判断不对:{number}")
  48. return 0
  49. def parse_question(question_block):
  50. question_info = {}
  51. question_match = re.search(r'问题:\s*(.*)', question_block)
  52. if question_match:
  53. question_info['trunk'] = question_match.group(1).strip()
  54. analysis_match = re.search(r'解析:\s*(.*)', question_block)
  55. if analysis_match:
  56. question_info['analysis'] = analysis_match.group(1).strip()
  57. options_match = re.search(r'选项:(.*)', question_block)
  58. if options_match:
  59. options_text = options_match.group(1).strip()
  60. options_list = re.split(r'\s*[BCDA]\.\s*', options_text)[1:]
  61. candidates = []
  62. for i, option_text in enumerate(options_list, start=65):
  63. label = chr(i)
  64. text = option_text.strip()
  65. candidates.append({
  66. "label": label,
  67. "text": text,
  68. "isRight": 0
  69. })
  70. question_info['candidates'] = candidates
  71. answer_match = re.search(r'答案:([ABCD])', question_block)
  72. if answer_match and 'candidates' in question_info:
  73. correct_label = answer_match.group(1)
  74. for candidate in question_info['candidates']:
  75. if candidate['label'] == correct_label:
  76. candidate['isRight'] = 1
  77. return question_info
  78. class GetArticle:
  79. def __init__(self):
  80. self.m = MySQLUploader()
  81. self.ds = DS()
  82. self.callback_url_dict = {}
  83. self.real_ip_dict = {}
  84. self.demo_name = {}
  85. self.punctuation = [",", ".", "!", "?", ":", ";", '"', "–", "_", "-", "...", "......"]
  86. all_exchange_words.update(self.punctuation)
  87. def parser_insert_to_mysql(self, resp_result):
  88. for single_article in resp_result['articles']:
  89. article = single_article['body']
  90. article_json = json.dumps(single_article)
  91. difficult_value = find_interval(get_article_difficulty(article))
  92. if not difficult_value:
  93. logger.error("文章难度等级为0;")
  94. sql = "INSERT INTO spring_bamboo_article (article_json,difficult_level) VALUES (%s,%s)"
  95. self.m.execute_(sql, (article_json, difficult_value))
  96. def submit_task(self, words_meaning_list: list, take_count: int, student_stage: int, real_ip: str, demo_name: str):
  97. """
  98. words_meaning_ids: 词义id 包含词义ID的数组集合,用于生成文章。- 示例:[110, 111, 112, 113, 114]
  99. take_count: 取文章数量 (int类型,正常是2篇,最大8篇)
  100. student_stage: 学段(int类型:1.小学;2.初中;3.高中;)
  101. demo_name: 项目名称
  102. """
  103. task_id = randint(10000000, 99999999)
  104. words_meaning_str = ";".join([i["spell"] + ":" + i["meaning"] for i in words_meaning_list])
  105. logger.info(f"生成文章id。task_id:{task_id}。词义组:{words_meaning_str}.")
  106. self.real_ip_dict[task_id] = real_ip
  107. self.demo_name[task_id] = demo_name
  108. try:
  109. resp_result = self.run_task(words_meaning_list, task_id, take_count, student_stage)
  110. self.parser_insert_to_mysql(resp_result)
  111. return resp_result
  112. except Exception as e:
  113. err_msg = f"GetArticle提交任务失败{type(e).__name__},{e}"
  114. log_err_e(e, msg="GetArticle提交任务失败;")
  115. return err_msg
  116. def get_article(self, words_meaning_list, student_stage, task_id, take_count) -> dict:
  117. diffculty_control = {
  118. 1: {"grade": "小学", "article_word_count": 60, "desc_difficulty": "最简单最容易没有难度", "paragraph_count": 1,
  119. "desc2": "文章整体非常简洁,通俗易懂,适合初学者,刚入门,单词全是最常见的,语句通顺即可。",
  120. "choice_desc": "选择题难度尽可能简单,但是不要让所有选择题让其直接在文中找到答案,允许1-2个选择题很简单,参考中国小学生水平"},
  121. 2: {"grade": "初中", "article_word_count": 200, "desc_difficulty": "简单、常见、难度低", "paragraph_count": 3,
  122. "desc2": "文章整体难度适中,大约和中国初中生,中国CET-3,雅思4分这样的难度标准。",
  123. "choice_desc": "选择题难度适中,但是不要所有选择题让其直接在文中找到答案,参考中国初中生水平,中考标准。"},
  124. 3: {"grade": "高中", "article_word_count": 300, "desc_difficulty": "常见、高中难度的", "paragraph_count": 3,
  125. "desc2": "文章整体难度适中,大约和中国的高中生,中国CET-4,雅思5分这样的难度标准。",
  126. "choice_desc": "选择题难度偏难,要有迷惑性,不要出现直接在文中找到答案,参考中国高中生水平,高考标准。"}
  127. }
  128. take_count_dict = {0: "", 1: "一", 2: "二", 3: "三", 4: "四", 5: "五", 6: "六", 7: "七", 8: "八"}
  129. different_cou = take_count_dict.get(take_count, "")
  130. grade = diffculty_control[student_stage]["grade"]
  131. select_word_count = diffculty_control[student_stage]["article_word_count"]
  132. select_diffculty = diffculty_control[student_stage]["desc_difficulty"]
  133. select_paragraph_count = diffculty_control[student_stage]["paragraph_count"]
  134. desc2 = diffculty_control[student_stage]["desc2"]
  135. choice_desc = diffculty_control[student_stage]["choice_desc"]
  136. shuffle(words_meaning_list)
  137. words_meaning_str = ";".join([i["spell"] + ":" + i["meaning"] for i in words_meaning_list])
  138. q = f"""不要与前面{different_cou}篇一样,要不同的场景。你是一名在中国的英语教师,下面我会为你提供一些带中文词义的英语单词,请根据这些单词的中文词义,\
  139. 生成带英文标题和中文翻译的考场英语文章,注意这个单词有多个词义时,生成的英语文章一定要用提供的中文词义。并挑选一句复杂的句子和其中文翻译,放入difficultSentences。
  140. 提供单词:{words_meaning_str}
  141. 要求:
  142. 1.必须用提供的这个词义的单词,其他单词使用{select_diffculty}的单词。{desc2}
  143. 2.文章中使用提供单词,一定要和提供单词的中文词义匹配,尤其是一词多义时,务必使用提供单词的词义。必须要用提供单词的词义。如果用到的词义与提供单词词义不一致,请不要使用这个单词。
  144. 3.生成的文章要求{select_word_count}词左右,可以用\\n\\n字符分段,一般{select_paragraph_count}个段落左右。
  145. 4.优先保证文章语句通顺,意思不要太生硬。不要为了使用特定的单词,造成文章语义前后不搭,允许不使用个别词义。
  146. 5.回复json,格式:{{"title":英文标题,"english":英语文章,"chinese":中文翻译,"difficultSentences": [
  147. {{
  148. "english": "",
  149. "chinese": ""
  150. }}
  151. ]}}
  152. """
  153. try:
  154. real_ip = self.real_ip_dict[task_id]
  155. demo_name = self.demo_name[task_id]
  156. r_json = json.loads(
  157. self.ds.get_article(q, temperature=1, json_resp=True, real_ip=real_ip, demo_name=demo_name, max_tokens=8000))
  158. r_json["body"] = r_json["title"] + "\n\n" + r_json["english"]
  159. del r_json["title"]
  160. q_choice_question = f"""你是一名在中国的{grade}英语教师,下面我会为你提供一篇英语短文,请根据短文设计4个选择题.
  161. 短文:{r_json["english"]}
  162. ###要求:
  163. 1. 生成的选择题不要让学生从短文中直接找到答案,可以混淆,最好让学生推理或排除获得正确答案。用词可以{select_diffculty},出题要参考中国的中考高考。
  164. 2.{choice_desc}
  165. 3. 每个选择题之间间隔两行,回复格式如下:
  166. 问题: 英语的选择题问题1文本;
  167. 解析: 中文的选择题答案解析;
  168. 选项:A. 选项1 B. 选项2 C. 选项3 D. 选项4
  169. 答案:B
  170. 其他几个选择题依次类推
  171. """
  172. resp_text = self.ds.get_article(q_choice_question, temperature=1, real_ip=real_ip, demo_name=demo_name, max_tokens=8000)
  173. questions = resp_text.strip().split('\n\n')
  174. parsed_questions = [parse_question(q) for q in questions]
  175. json_data = {"questions": parsed_questions}
  176. allWordAmount = 0
  177. allWordAmount += len(split_text_to_word(r_json["english"]))
  178. for i in json_data["questions"]:
  179. count_trunk = len(split_text_to_word(i["trunk"]))
  180. count_candidates = sum([len(split_text_to_word(ii["text"])) for ii in i["candidates"]])
  181. allWordAmount += count_trunk
  182. allWordAmount += count_candidates
  183. return {**r_json, **json_data, "allWordAmount": allWordAmount}
  184. except json.decoder.JSONDecodeError:
  185. logger.error("gpt生成文章回复json格式化错误")
  186. except Exception as e:
  187. logger.error(f"gpt生成文章回复其他错误.{type(e).__name__} {e}")
  188. def run_get_article_task(self, words_meaning_list, task_id, take_count, student_stage) -> dict:
  189. """
  190. :param words_meaning_list: 数据库内查出来的单词和词义的列表
  191. :param task_id: 任务id
  192. :param take_count: 文章数量
  193. :param student_stage: 学段标识,整型
  194. :return:
  195. """
  196. futures = []
  197. for i in range(take_count):
  198. futures.append(pool_executor.submit(self.get_article, words_meaning_list, student_stage, task_id, take_count))
  199. wait(futures)
  200. return_json = {"articles": []}
  201. for t in futures:
  202. return_json["articles"].append(t.result())
  203. return return_json
  204. def run_task(self, words_meaning_list, task_id, take_count, student_stage):
  205. try:
  206. outside_json = self.run_get_article_task(words_meaning_list, task_id, take_count, student_stage)
  207. logger.success(f"文章3-DeepSeek文章任务完成。taskid:{task_id}")
  208. return outside_json
  209. except Exception as e:
  210. logger.error(f"{type(e).__name__} {e}")
  211. finally:
  212. self.real_ip_dict.pop(task_id)
  213. self.demo_name.pop(task_id)