# get_article2.py (14 KB)
  1. # -*- coding: utf-8 -*-
  2. from gpt.chatgpt import get_answer_from_gpt
  3. from gpt.gpt_check import CheckGptAnswer, CheckArticleResult
  4. from tools.new_mysql import MySQLUploader
  5. from tools.loglog import logger, log_err_e
  6. from tools.thread_pool_manager import pool_executor
  7. from common.common_data import all_exchange_words
  8. from common.split_text import split_text_to_word
  9. from pydantic import BaseModel
  10. from cachetools import TTLCache
  11. from concurrent.futures import wait
  12. from random import randint, shuffle
  13. import json
  14. import requests
  15. from openpyxl import load_workbook
  16. from tenacity import retry, stop_after_attempt, wait_fixed
  17. def get_article_difficulty(article) -> int:
  18. """获取文章的难度值"""
  19. url = "http://qbank.yunzhixue.cn/api/article/analysis"
  20. data = {"body": article, "question": ""}
  21. try:
  22. response = requests.post(url, json=data)
  23. except Exception as e:
  24. log_err_e(e, msg="获取文章难度值;")
  25. return 0
  26. if response.status_code == 200:
  27. difficult_value = response.json()['data']['difficult']
  28. return difficult_value
  29. else:
  30. logger.error(f"错误状态码{response.status_code}")
  31. def find_interval(number) -> int:
  32. """
  33. 判断一个数字属于哪个难度等级区间。31级是例外情况,需要排查
  34. :param number: 要检查的数字。
  35. :return: 返回包含该数字的区间,如果没有找到,则返回 None。
  36. """
  37. intervals = [(1, 200), (201, 250), (251, 300), (301, 350), (351, 400), (401, 450), (451, 550), (551, 650), (651, 750), (751, 850), (851, 950),
  38. (951, 1100),
  39. (1101, 1250), (1251, 1400), (1401, 1550), (1551, 1700), (1701, 1900), (1901, 2100), (2101, 2300), (2301, 2600), (2601, 2900),
  40. (2901, 3200),
  41. (3201, 3500), (3501, 3900), (3901, 4300), (4301, 4700), (4701, 5100), (5101, 5500), (5501, 5900), (5901, 6500), (6501, 99999)]
  42. for index, (start, end) in enumerate(intervals, start=1):
  43. if start <= number <= end:
  44. return index
  45. logger.error(f"文章难度判断不对:{number}")
  46. return 0
  47. def merge_and_split(list1, list2):
  48. combined = list1 + list2
  49. import random
  50. random.shuffle(combined)
  51. two_thirds = []
  52. one_third = []
  53. total_length = len(combined)
  54. if total_length > 15:
  55. two_thirds = combined[:15]
  56. one_third = combined[15:]
  57. else:
  58. two_thirds = combined
  59. one_third = []
  60. return two_thirds, one_third
  61. class GetArticle:
  62. def __init__(self):
  63. self.m = MySQLUploader()
  64. self.callback_url_dict = {}
  65. self.real_ip_dict = {}
  66. self.demo_name = {}
  67. self.article_result = {}
  68. self.punctuation = [",", ".", "!", "?", ":", ";", '"', "–", "_", "-", "...", "......"]
  69. all_exchange_words.update(self.punctuation)
  70. self.exchange_data: dict[str, list] = {}
  71. self.read_spring_bamboo_exchange_table()
  72. def read_spring_bamboo_exchange_table(self):
  73. """变形是键,原型是值"""
  74. wb = load_workbook(r"data/春笋单词对照变形.xlsx", read_only=True, data_only=True)
  75. ws = wb.active
  76. for row in ws.values:
  77. prototype = row[0]
  78. exchange = row[1]
  79. if prototype not in self.exchange_data:
  80. self.exchange_data[prototype] = [exchange]
  81. else:
  82. self.exchange_data[prototype].append(exchange)
  83. wb.close()
  84. def parser_insert_to_mysql(self, resp_result):
  85. for single_article in resp_result['articles']:
  86. article = single_article['body']
  87. article_json = json.dumps(single_article)
  88. difficult_value = find_interval(get_article_difficulty(article))
  89. if not difficult_value:
  90. logger.error("文章难度等级为0;")
  91. sql = "INSERT INTO spring_bamboo_article (article_json,difficult_level) VALUES (%s,%s)"
  92. self.m.execute_(sql, (article_json, difficult_value))
  93. def submit_task(self, core_words: list, extend_words: list, take_count: int, student_stage: int, real_ip: str,
  94. demo_name: str, article_difficulty: int):
  95. """
  96. words_meaning_list: 词义id 包含词义ID的数组集合,用于生成文章。- 示例:[110, 111, 112, 113, 114]
  97. take_count: 取文章数量 (int类型,正常是2篇,最大8篇)
  98. student_stage: 学段(int类型:1.小学;2.初中;3.高中;)
  99. demo_name: 项目名称
  100. article_difficulty:文章难度值1-4200模糊范围
  101. """
  102. task_id = randint(10000000, 99999999)
  103. logger.info(f"生成文章id。task_id:{task_id}")
  104. self.real_ip_dict[task_id] = real_ip
  105. self.demo_name[task_id] = demo_name
  106. try:
  107. resp_result = self.run_task(core_words, extend_words, task_id, take_count, student_stage, article_difficulty)
  108. self.parser_insert_to_mysql(resp_result)
  109. return resp_result
  110. except Exception as e:
  111. err_msg = f"GetArticle提交任务失败{type(e).__name__},{e}"
  112. log_err_e(e, msg="GetArticle提交任务失败;")
  113. return err_msg
  114. @retry(stop=stop_after_attempt(2), wait=wait_fixed(3), reraise=True)
  115. def get_article(self, core_words: list, extend_words: list, student_stage: int, task_id: int, take_count: int, article_difficulty) -> dict:
  116. article_grade = find_interval(article_difficulty)
  117. if 0 < article_grade <= 10:
  118. article_word_count = 50 + 10 * article_grade
  119. elif 10 < article_grade <= 20:
  120. article_word_count = 150 + 30 * (article_grade - 10)
  121. else:
  122. article_word_count = 450 + 20 * (article_grade - 20)
  123. diffculty_control = {
  124. 1: {"grade": "小学", "article_word_count": article_word_count, "desc_difficulty": "最简单最容易没有难度", "paragraph_count": "1-2",
  125. "desc2": "文章整体非常简洁,通俗易懂,适合初学者,刚入门,单词全是最常见的,语句通顺即可。",
  126. "choice_desc": "选择题难度尽可能简单,但是不要让所有选择题让其直接在文中找到答案,允许1-2个选择题很简单,参考中国小学生水平"},
  127. 2: {"grade": "初中", "article_word_count": article_word_count, "desc_difficulty": "简单、常见、难度低", "paragraph_count": "2-3",
  128. "desc2": "文章整体难度适中,大约和中国初中生,中国CET-3,雅思4分这样的难度标准。",
  129. "choice_desc": "选择题难度适中,但是不要所有选择题让其直接在文中找到答案,参考中国初中生水平,中考标准。"},
  130. 3: {"grade": "初中", "article_word_count": article_word_count, "desc_difficulty": "简单、常见、难度低", "paragraph_count": "2-3",
  131. "desc2": "文章整体难度适中,大约和中国初中生,中国CET-3,雅思4分这样的难度标准。",
  132. "choice_desc": "选择题难度适中,但是不要所有选择题让其直接在文中找到答案,参考中国初中生水平,中考标准。"},
  133. 4: {"grade": "高中", "article_word_count": article_word_count, "desc_difficulty": "常见、高中难度的", "paragraph_count": "3-5",
  134. "desc2": "文章整体难度适中,大约和中国的高中生,中国CET-6,雅思6分这样的难度标准。",
  135. "choice_desc": "选择题难度偏难,要有迷惑性混淆性,答案不要出现直接在文中,4个选项要学生推理或逻辑判断,参考中国高中生水平,高考标准。"}
  136. }
  137. take_count_dict = {0: "", 1: "一", 2: "二", 3: "三", 4: "四", 5: "五", 6: "六", 7: "七", 8: "八", 9: "九"}
  138. different_cou = take_count_dict.get(take_count, "")
  139. grade = diffculty_control[student_stage]["grade"]
  140. select_word_count = diffculty_control[student_stage]["article_word_count"]
  141. select_diffculty = diffculty_control[student_stage]["desc_difficulty"]
  142. select_paragraph_count = diffculty_control[student_stage]["paragraph_count"]
  143. desc2 = diffculty_control[student_stage]["desc2"]
  144. choice_desc = diffculty_control[student_stage]["choice_desc"]
  145. shuffle(core_words)
  146. core_words_meaning_str = ";".join([str(i['meaning_id']) + ' ' + i["spell"] + ":" + i["meaning"] for i in core_words])
  147. extend_words_meaning_str = ";".join([str(i['meaning_id']) + ' ' + i["spell"] + ":" + i["meaning"] for i in extend_words])
  148. no_escape_code = r"\\n\\n"
  149. json_model = r'{"difficultSentences":[{"english":"string","chinese":"string"}],"usedMeanIds":[0,0,0],"englishArticle":"string","chineseArticle":"string","questions":[{"trunk":"string","analysis":"string","candidates":[{"label":"string","text":"string","isRight":0}]}]}'
  150. sys_prompt = "你是一个专业的英语老师,擅长根据用户提供的词汇生成对应的英语文章和中文翻译和4个配套选择题。"
  151. q = f"""下面我会为你提供两组数据,[单词组1]和[单词组2](里面包含词义id,英语单词,中文词义),优先使用[单词组1]内的单词,请根据这些单词的中文词义,\
  152. 生成一篇带中文翻译的考场英语文章,英语文章和中文翻译要有[标题]。注意这个单词有多个词义时,生成的英语文章一定要用提供的中文词义。并挑选一句复杂的句子和其中文翻译,放入difficultSentences。\
  153. 英语文章,放入"englishArticle"中。中文翻译,放入"chineseArticle"中。最终文中使用到的单词id放入"usedMeanIds"中。\
  154. 4个选择题,放入questions字段。questions结构下有4个选择题对象,其中trunk是[英语]问题文本,analysis是[中文]的问题分析,candidates是4个ABCD选项,内部有label是指选项序号A B C D ,text是[英语]选项文本,isRight是否正确答案1是正确0是错误。
  155. 要求:
  156. 1.必须用提供的这个词义的单词,其他单词使用{select_diffculty}的单词。{desc2}{choice_desc}
  157. 2.优先保证文章语句通顺,意思不要太生硬。不要为了使用特定的单词,造成文章语义前后不搭,允许不使用个别词义。
  158. 3.文章中使用提供单词,一定要和提供单词的中文词义匹配,尤其是一词多义时,务必使用提供单词的词义。必须要用提供单词的词义。如果用到的词义与提供单词词义不一致,请不要使用这个单词。
  159. 4.生成的文章要求{select_word_count}词左右,可以用{no_escape_code}字符分段,一般{select_paragraph_count}个段落左右。第一段是文章标题。
  160. 5.生成文章优先使用[单词组1]的词义,其次可以挑选使用[单词组2]的词义。允许不使用[单词组1]的个别单词,优先保证文章整体意思通顺连贯和故事完整。
  161. 6.回复紧凑无空格的json数据,示例:{json_model}
  162. 提供[单词组1]:{core_words_meaning_str};
  163. 提供[单词组2]:{extend_words_meaning_str};
  164. """
  165. try:
  166. real_ip = self.real_ip_dict[task_id]
  167. demo_name = self.demo_name[task_id]
  168. r_json = json.loads(get_answer_from_gpt(q, temperature=1, json_resp=True, real_ip=real_ip, demo_name=demo_name, model='gpt-4.1',
  169. check_fucn=CheckArticleResult.get_article_1, max_tokens=8000, sys_prompt=sys_prompt))
  170. allWordAmount = 0
  171. allWordAmount += len(split_text_to_word(r_json["englishArticle"]))
  172. for i in r_json["questions"]:
  173. count_trunk = len(split_text_to_word(i["trunk"]))
  174. count_candidates = sum([len(split_text_to_word(ii["text"])) for ii in i["candidates"]])
  175. allWordAmount += count_trunk
  176. allWordAmount += count_candidates
  177. usedMeanIds: list = r_json['usedMeanIds']
  178. article_words = split_text_to_word(r_json['englishArticle'])
  179. for i in core_words + extend_words:
  180. meaning_id = i.get('meaning_id', 0)
  181. if not meaning_id:
  182. continue
  183. word = i["spell"]
  184. if meaning_id not in usedMeanIds and word in self.exchange_data:
  185. words_exchanges_list = self.exchange_data[word]
  186. for exchange_word in words_exchanges_list:
  187. if exchange_word in article_words:
  188. usedMeanIds.append(meaning_id)
  189. break
  190. r_json["body"] = r_json.pop("englishArticle")
  191. r_json["chinese"] = r_json.pop("chineseArticle")
  192. return {**r_json, "allWordAmount": allWordAmount}
  193. except json.decoder.JSONDecodeError:
  194. logger.error("gpt生成文章回复json格式化错误")
  195. raise
  196. except Exception as e:
  197. logger.error(f"gpt生成文章回复其他错误.{type(e).__name__} {e}")
  198. raise
  199. def run_get_article_task(self, core_words, extend_words, task_id, take_count, student_stage, article_difficulty) -> dict:
  200. """
  201. :param core_words: 核心单词数据,优先级1;可能为空
  202. :param extend_words: 扩展单词数据,优先级2;可能为空
  203. :param task_id: 任务id
  204. :param take_count: 文章数量
  205. :param student_stage: 学段标识,整型,123
  206. :param article_difficulty:文章难度1-4200模糊范围
  207. :return:
  208. """
  209. futures = []
  210. for i in range(take_count):
  211. futures.append(pool_executor.submit(self.get_article, core_words, extend_words, student_stage, task_id, take_count, article_difficulty))
  212. wait(futures)
  213. return_json = {"articles": []}
  214. for t in futures:
  215. return_json["articles"].append(t.result())
  216. return return_json
  217. def run_task(self, core_words, extend_words, task_id, take_count, student_stage, article_difficulty):
  218. try:
  219. outside_json = self.run_get_article_task(core_words, extend_words, task_id, take_count, student_stage, article_difficulty)
  220. logger.success(f"文章2任务完成。taskid:{task_id}\n{outside_json}")
  221. return outside_json
  222. except Exception as e:
  223. logger.error(f"{type(e).__name__} {e}")
  224. finally:
  225. self.real_ip_dict.pop(task_id)
  226. self.demo_name.pop(task_id)