get_article2.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  1. # -*- coding: utf-8 -*-
  2. import json
  3. from collections import defaultdict
  4. from random import randint, shuffle, sample
  5. import httpx
  6. import requests
  7. from fastapi import BackgroundTasks
  8. from openpyxl import load_workbook
  9. from tenacity import retry, stop_after_attempt, wait_fixed
  10. from common.common_data import all_exchange_words
  11. from common.split_text import split_text_to_word, get_article_words_count
  12. from gpt.chatgpt import get_article_gpt_pydantic
  13. from gpt.gpt_check import CheckArticleResult
  14. from tools.loglog import logger, log_err_e
  15. from tools.new_mysql import MySQLUploader
  16. def get_article_difficulty(article) -> int:
  17. """获取文章的难度值"""
  18. url = "http://qbank.yunzhixue.cn/api/article/analysis"
  19. data = {"body": article, "question": ""}
  20. try:
  21. response = requests.post(url, json=data)
  22. except Exception as e:
  23. log_err_e(e, msg="获取文章难度值;")
  24. return 0
  25. if response.status_code == 200:
  26. difficult_value = response.json()['data']['difficult']
  27. return difficult_value
  28. else:
  29. logger.error(f"错误状态码{response.status_code}")
  30. def find_interval(number) -> int:
  31. """
  32. 判断一个数字属于哪个难度等级区间。31级是例外情况,需要排查
  33. :param number: 要检查的数字。
  34. :return: 返回包含该数字的区间,如果没有找到,则返回 None。
  35. """
  36. intervals = [(1, 200), (201, 250), (251, 300), (301, 350), (351, 400), (401, 450), (451, 550), (551, 650), (651, 750), (751, 850),
  37. (851, 950),
  38. (951, 1100),
  39. (1101, 1250), (1251, 1400), (1401, 1550), (1551, 1700), (1701, 1900), (1901, 2100), (2101, 2300), (2301, 2600),
  40. (2601, 2900),
  41. (2901, 3200),
  42. (3201, 3500), (3501, 3900), (3901, 4300), (4301, 4700), (4701, 5100), (5101, 5500), (5501, 5900), (5901, 6500),
  43. (6501, 99999)]
  44. for index, (start, end) in enumerate(intervals, start=1):
  45. if start <= number <= end:
  46. return index
  47. logger.error(f"文章难度判断不对:{number}")
  48. return 0
  49. def merge_and_split(list1, list2):
  50. combined = list1 + list2
  51. import random
  52. random.shuffle(combined)
  53. two_thirds = []
  54. one_third = []
  55. total_length = len(combined)
  56. if total_length > 15:
  57. two_thirds = combined[:15]
  58. one_third = combined[15:]
  59. else:
  60. two_thirds = combined
  61. one_third = []
  62. return two_thirds, one_third
  63. class GetArticle:
  64. def __init__(self):
  65. self.m = MySQLUploader()
  66. self.callback_url_dict = defaultdict(str)
  67. self.real_ip_dict = defaultdict(str)
  68. self.demo_name = defaultdict(str)
  69. self.article_result = {}
  70. self.punctuation = [",", ".", "!", "?", ":", ";", '"', "–", "_", "-", "...", "......"]
  71. all_exchange_words.update(self.punctuation)
  72. self.exchange_data: dict[str, list] = {}
  73. self.read_spring_bamboo_exchange_table()
  74. def read_spring_bamboo_exchange_table(self):
  75. """变形是键,原型是值"""
  76. wb = load_workbook(r"data/春笋单词对照变形.xlsx", read_only=True, data_only=True)
  77. ws = wb.active
  78. for row in ws.values:
  79. prototype = row[0]
  80. exchange = row[1]
  81. if prototype not in self.exchange_data:
  82. self.exchange_data[prototype] = [exchange]
  83. else:
  84. self.exchange_data[prototype].append(exchange)
  85. wb.close()
  86. def parser_insert_to_mysql(self, resp_result):
  87. try:
  88. for single_article in resp_result['articles']:
  89. article = single_article['body']
  90. article_json = json.dumps(single_article)
  91. difficult_value = find_interval(get_article_difficulty(article))
  92. if not difficult_value:
  93. logger.error("文章难度等级为0;")
  94. sql = "INSERT INTO spring_bamboo_article (article_json,difficult_level) VALUES (%s,%s)"
  95. self.m.execute_(sql, (article_json, difficult_value))
  96. except Exception as e:
  97. logger.error(f"插入数据库时发生错误: {str(e)}")
  98. def submit_task(self, real_ip: str, core_words: list, take_count: int,
  99. demo_name: str, reading_level: int, article_length: int, exercise_id: int,
  100. background_tasks: BackgroundTasks):
  101. """
  102. core_words: 词义数据组
  103. take_count: 取文章数量 (int类型,正常是2篇,最大8篇)
  104. demo_name: 项目名称
  105. reading_level:阅读等级
  106. article_length:文章长度
  107. exercise_id:学案id
  108. background_tasks: FastAPI的后台任务管理器
  109. """
  110. task_id = randint(10000000, 99999999)
  111. logger.info(f"reading-comprehension 生成文章id。学案id:{exercise_id},task_id:{task_id}")
  112. try:
  113. self.real_ip_dict[task_id] = real_ip
  114. self.demo_name[task_id] = demo_name
  115. resp_result = self.run_task(core_words, task_id, exercise_id, take_count, reading_level, article_length)
  116. background_tasks.add_task(self.parser_insert_to_mysql, resp_result)
  117. logger.success(f"reading-comprehension 文章2任务完成。学案id:{exercise_id},taskid:{task_id}")
  118. return resp_result
  119. except Exception as e:
  120. err_msg = f"GetArticle提交任务失败{type(e).__name__},{e}"
  121. log_err_e(e, msg="GetArticle提交任务失败;")
  122. return err_msg
  123. finally:
  124. self.real_ip_dict.pop(task_id, None)
  125. self.demo_name.pop(task_id, None)
  126. def __parse_gpt_resp(self, gpt_resp: dict, core_words: list):
  127. return_json = {"articles": []}
  128. for choice in gpt_resp["choices"]:
  129. single_article_dict = json.loads(choice["message"]["content"])
  130. allWordAmount = 0
  131. articleWordAmount = get_article_words_count(single_article_dict["englishArticle"])
  132. allWordAmount += articleWordAmount
  133. for i in single_article_dict["questions"]:
  134. count_trunk = get_article_words_count(i["trunk"])
  135. count_candidates = sum([get_article_words_count(ii["text"]) for ii in i["candidates"]])
  136. allWordAmount += count_trunk
  137. allWordAmount += count_candidates
  138. usedMeanIds: list = single_article_dict['usedMeanIds']
  139. article_words = split_text_to_word(single_article_dict['englishArticle'])
  140. for i in core_words:
  141. meaning_id = i.get('meaning_id', 0)
  142. if not meaning_id:
  143. continue
  144. word = i["spell"]
  145. if meaning_id not in usedMeanIds and word in self.exchange_data:
  146. words_exchanges_list = self.exchange_data[word]
  147. for exchange_word in words_exchanges_list:
  148. if exchange_word in article_words:
  149. usedMeanIds.append(meaning_id)
  150. break
  151. single_article_dict["body"] = single_article_dict.pop("englishArticle")
  152. single_article_dict["chinese"] = single_article_dict.pop("chineseArticle")
  153. for q in single_article_dict['questions']:
  154. data = q['candidates']
  155. shuffled_candidates = sample(data, len(data))
  156. labels = ['A', 'B', 'C', 'D']
  157. for index, candidate in enumerate(shuffled_candidates):
  158. candidate['label'] = labels[index]
  159. q['candidates'] = shuffled_candidates
  160. return_json['articles'].append({**single_article_dict, "allWordAmount": allWordAmount, "articleWordAmount": articleWordAmount})
  161. return return_json
  162. @retry(stop=stop_after_attempt(3), wait=wait_fixed(2), reraise=True)
  163. def get_article(self, core_words: list, task_id: int, exercise_id: int, reading_level, article_length, n) -> dict:
  164. if not article_length:
  165. if 0 < reading_level <= 10:
  166. article_length = 50 + 10 * reading_level
  167. elif 10 < reading_level <= 20:
  168. article_length = 150 + 30 * (reading_level - 10)
  169. else:
  170. article_length = 450 + 20 * (reading_level - 20)
  171. for index, (start, end) in enumerate([(1, 8), (9, 16), (17, 24), (24, 30)], start=1):
  172. if start <= reading_level <= end:
  173. difficulty_control_stage = index
  174. break
  175. else:
  176. difficulty_control_stage = 2
  177. diffculty_control = {
  178. 1: {"grade": "小学", "desc_difficulty": "最简单最容易没有难度", "paragraph_count": "1-2",
  179. "desc2": "文章整体非常简洁,通俗易懂,适合初学者,刚入门,单词全是最常见的,语句通顺即可。",
  180. "choice_desc": "选择题难度尽可能简单,参考中国小学生水平"},
  181. 2: {"grade": "初中", "desc_difficulty": "简单、常见、难度低", "paragraph_count": "2-3",
  182. "desc2": "文章整体难度适中,大约和中国初中生,中国CET-3,雅思4分这样的难度标准。",
  183. "choice_desc": "选择题难度适中,但是不要所有选择题让其直接在文中找到答案,参考中国初中生水平,中考标准。"},
  184. 3: {"grade": "初中", "desc_difficulty": "简单、常见、难度低", "paragraph_count": "2-3",
  185. "desc2": "文章整体难度适中,大约和中国初中生,中国CET-3,雅思4分这样的难度标准。",
  186. "choice_desc": "选择题难度适中,但是不要所有选择题让其直接在文中找到答案,参考中国初中生水平,中考标准。"},
  187. 4: {"grade": "高中", "desc_difficulty": "常见、高中难度的", "paragraph_count": "3-5",
  188. "desc2": "文章整体难度适中,大约和中国的高中生,中国CET-6,雅思6分这样的难度标准。",
  189. "choice_desc": "选择题难度偏难,要有迷惑性混淆性,答案不要出现直接在文中,4个选项要学生推理或逻辑判断,参考中国高中生水平,高考标准。"}
  190. }
  191. grade = diffculty_control[difficulty_control_stage]["grade"]
  192. select_diffculty = diffculty_control[difficulty_control_stage]["desc_difficulty"]
  193. select_paragraph_count = diffculty_control[difficulty_control_stage]["paragraph_count"]
  194. desc2 = diffculty_control[difficulty_control_stage]["desc2"]
  195. choice_desc = diffculty_control[difficulty_control_stage]["choice_desc"]
  196. shuffle(core_words)
  197. core_words_meaning_str = "; ".join([f"[{i['meaning_id']} {i['spell']} {i['meaning']}]" for i in core_words])
  198. no_escape_code = r"\\n\\n"
  199. sys_prompt = "你是一个专业的英语老师,擅长根据用户提供的词汇生成对应的英语文章和中文翻译和4个配套选择题。"
  200. q = f"""下面我会为你提供一组数据,[单词组](里面包含词义id,英语单词,中文词义),请根据这些单词的中文词义,\
  201. 生成一篇带中文翻译的考场英语文章,英语文章和中文翻译要有[标题]。特别注意这个单词有多个词义时,生成的英语文章一定要用提供的中文词义,例如我提供单词[change 零钱],就不要使用[变化]的词义。
  202. 要求:
  203. 1.必须用提供的这个词义的单词,其他单词使用{select_diffculty}的单词。{desc2}{choice_desc}
  204. 2.优先保证文章语句通顺,意思不要太生硬。不要为了使用特定的单词,造成文章语义前后不搭,允许不使用个别词义。
  205. 3.文章中使用提供单词,一定要和提供单词的中文词义匹配,尤其是一词多义时,务必使用提供单词的词义。必须要用提供单词的词义。如果用到的词义与提供单词词义不一致,请不要使用这个单词。
  206. 4.生成的文章要求{article_length}词左右,可以用{no_escape_code}字符分段,一般{select_paragraph_count}个段落左右。第一段是文章标题。不需要markdown格式。
  207. 5.允许不使用[单词组]的个别单词,优先保证文章整体意思通顺连贯和故事完整。
  208. 提供[单词组]:{core_words_meaning_str};
  209. """
  210. try:
  211. real_ip = self.real_ip_dict[task_id]
  212. demo_name = self.demo_name[task_id]
  213. gpt_resp = get_article_gpt_pydantic(q, temperature=1.2, real_ip=real_ip, demo_name=demo_name, model='gpt-4.1',
  214. check_fucn=CheckArticleResult.get_article_1, max_tokens=15000,
  215. sys_prompt=sys_prompt, n=n, task_id=task_id, exercise_id=exercise_id)
  216. multi_articles_dict = self.__parse_gpt_resp(gpt_resp=gpt_resp, core_words=core_words)
  217. return multi_articles_dict
  218. except httpx.HTTPError as e:
  219. logger.error(f"HTTP请求错误: {str(e)}")
  220. raise
  221. except json.JSONDecodeError as e:
  222. logger.error(f"JSON解析错误: {str(e)}")
  223. raise
  224. except Exception as e:
  225. log_err_e(e, f"gpt生成文章回复其他错误.")
  226. raise
  227. def run_get_article_task(self, core_words, task_id, exercise_id, take_count, reading_level, article_length) -> dict:
  228. """
  229. :param core_words: 核心单词数据,优先级1;可能为空
  230. :param task_id: 任务id
  231. :param take_count: 文章数量
  232. :param reading_level:阅读等级
  233. :param article_length:文章长度
  234. :return:
  235. """
  236. try:
  237. return_json = self.get_article(core_words, task_id, exercise_id, reading_level, article_length, n=take_count)
  238. return return_json
  239. except Exception as e:
  240. logger.error(f"运行文章任务时发生错误: {str(e)}")
  241. raise
  242. def run_task(self, core_words, task_id, exercise_id, take_count, reading_level, article_length):
  243. try:
  244. outside_json = self.run_get_article_task(core_words, task_id, exercise_id, take_count, reading_level, article_length)
  245. return outside_json
  246. except Exception as e:
  247. log_err_e(e, msg="外层总任务捕获错误")
  248. def cleanup(self):
  249. """清理所有资源"""
  250. pass