# get_article2.py

# -*- coding: utf-8 -*-
from gpt.chatgpt import get_answer_from_gpt, get_article_gpt_pydantic
from gpt.gpt_check import CheckGptAnswer, CheckArticleResult
from tools.new_mysql import MySQLUploader
from tools.loglog import logger, log_err_e
from tools.thread_pool_manager import pool_executor
from common.common_data import all_exchange_words
from common.split_text import split_text_to_word, get_article_words_count

from pydantic import BaseModel
from cachetools import TTLCache
from concurrent.futures import wait
from random import randint, shuffle, sample
import json, time
import requests
from openpyxl import load_workbook
from tenacity import retry, stop_after_attempt, wait_fixed
import httpx
import asyncio
from threading import Lock
from collections import defaultdict
from fastapi import BackgroundTasks

def get_article_difficulty(article) -> int:
    """Fetch the difficulty value of an article from the analysis service; returns 0 on any failure."""
    url = "http://qbank.yunzhixue.cn/api/article/analysis"
    data = {"body": article, "question": ""}
    try:
        response = requests.post(url, json=data)
    except Exception as e:
        log_err_e(e, msg="获取文章难度值;")
        return 0

    if response.status_code == 200:
        difficult_value = response.json()['data']['difficult']
        return difficult_value
    else:
        logger.error(f"错误状态码{response.status_code}")
        return 0

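# Note: the analysis endpoint is expected to answer with JSON of the form {"data": {"difficult": <int>}, ...};
# only the "difficult" field is used here (shape inferred from the parsing code above, not from API docs).
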
def find_interval(number) -> int:
    """
    Determine which difficulty-level interval a number falls into. Level 31 is an edge case that still needs review.
    :param number: the number to check.
    :return: the 1-based index of the interval containing the number, or 0 if no interval matches.
    """
    intervals = [(1, 200), (201, 250), (251, 300), (301, 350), (351, 400), (401, 450), (451, 550), (551, 650), (651, 750), (751, 850),
                 (851, 950), (951, 1100), (1101, 1250), (1251, 1400), (1401, 1550), (1551, 1700), (1701, 1900), (1901, 2100),
                 (2101, 2300), (2301, 2600), (2601, 2900), (2901, 3200), (3201, 3500), (3501, 3900), (3901, 4300), (4301, 4700),
                 (4701, 5100), (5101, 5500), (5501, 5900), (5901, 6500), (6501, 99999)]
    for index, (start, end) in enumerate(intervals, start=1):
        if start <= number <= end:
            return index
    logger.error(f"文章难度判断不对:{number}")
    return 0
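
# Illustrative mapping, derived from the intervals above: a raw difficulty of 230 falls into level 2,
# 1000 into level 12, and anything from 6501 upward into level 31 (the edge case flagged in the docstring).
# e.g. find_interval(get_article_difficulty(article_text)) -> 12
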
def merge_and_split(list1, list2):
    # Combine both lists, shuffle, then split: the first 15 items go into one group and the remainder
    # into the other (despite the two_thirds/one_third names, the cut-off is a fixed 15 items).
    combined = list1 + list2
    shuffle(combined)

    total_length = len(combined)
    if total_length > 15:
        two_thirds = combined[:15]
        one_third = combined[15:]
    else:
        two_thirds = combined
        one_third = []
    return two_thirds, one_third

class GetArticle:
    def __init__(self):
        self.m = MySQLUploader()

        self.callback_url_dict = defaultdict(str)
        self.real_ip_dict = defaultdict(str)
        self.demo_name = defaultdict(str)
        self.article_result = {}

        self.punctuation = [",", ".", "!", "?", ":", ";", '"', "–", "_", "-", "...", "......"]
        all_exchange_words.update(self.punctuation)

        self.exchange_data: dict[str, list] = {}
        self.read_spring_bamboo_exchange_table()

    def read_spring_bamboo_exchange_table(self):
        """Load the Spring Bamboo inflection table: the base form (prototype) is the key, its inflected forms are the values."""
        wb = load_workbook(r"data/春笋单词对照变形.xlsx", read_only=True, data_only=True)
        ws = wb.active
        for row in ws.values:
            prototype = row[0]
            exchange = row[1]
            if prototype not in self.exchange_data:
                self.exchange_data[prototype] = [exchange]
            else:
                self.exchange_data[prototype].append(exchange)
        wb.close()
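
    # Illustrative shape of self.exchange_data after loading (example words are hypothetical, assuming
    # column A of the workbook holds the base form and column B one inflected form per row):
    #   {"run": ["running", "ran"], "change": ["changed", "changes"], ...}
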
    def parser_insert_to_mysql(self, resp_result):
        try:
            for single_article in resp_result['articles']:
                article = single_article['body']
                article_json = json.dumps(single_article)
                difficult_value = find_interval(get_article_difficulty(article))
                if not difficult_value:
                    logger.error("文章难度等级为0;")
                sql = "INSERT INTO spring_bamboo_article (article_json,difficult_level) VALUES (%s,%s)"
                self.m.execute_(sql, (article_json, difficult_value))
        except Exception as e:
            logger.error(f"插入数据库时发生错误: {str(e)}")
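
    # Hypothetical DDL for the target table, inferred only from the INSERT statement above
    # (column types and the id column are assumptions, not taken from the actual database):
    #   CREATE TABLE spring_bamboo_article (
    #       id INT AUTO_INCREMENT PRIMARY KEY,
    #       article_json JSON,
    #       difficult_level INT
    #   );
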
    def submit_task(self, real_ip: str, core_words: list, take_count: int,
                    demo_name: str, reading_level: int, article_length: int, exercise_id: int,
                    background_tasks: BackgroundTasks):
        """
        core_words: word-meaning data group
        take_count: number of articles to generate (int; normally 2, at most 8)
        demo_name: project name
        reading_level: reading level
        article_length: article length in words
        exercise_id: exercise (lesson plan) id
        background_tasks: FastAPI background task manager
        """
        task_id = randint(10000000, 99999999)
        logger.info(f"reading-comprehension 生成文章id。学案id:{exercise_id},task_id:{task_id}")
        try:
            self.real_ip_dict[task_id] = real_ip
            self.demo_name[task_id] = demo_name

            resp_result = self.run_task(core_words, task_id, exercise_id, take_count, reading_level, article_length)
            background_tasks.add_task(self.parser_insert_to_mysql, resp_result)
            logger.success(f"reading-comprehension 文章2任务完成。学案id:{exercise_id},taskid:{task_id}")
            return resp_result
        except Exception as e:
            err_msg = f"GetArticle提交任务失败{type(e).__name__},{e}"
            log_err_e(e, msg="GetArticle提交任务失败;")
            return err_msg
        finally:
            self.real_ip_dict.pop(task_id, None)
            self.demo_name.pop(task_id, None)

    def __parse_gpt_resp(self, gpt_resp: dict, core_words: list):
        return_json = {"articles": []}
        for choice in gpt_resp["choices"]:
            single_article_dict = json.loads(choice["message"]["content"])

            # Word counts: the article body plus every question trunk and every candidate option.
            allWordAmount = 0
            articleWordAmount = get_article_words_count(single_article_dict["englishArticle"])
            allWordAmount += articleWordAmount
            for i in single_article_dict["questions"]:
                count_trunk = get_article_words_count(i["trunk"])
                count_candidates = sum([get_article_words_count(ii["text"]) for ii in i["candidates"]])
                allWordAmount += count_trunk
                allWordAmount += count_candidates

            # If a core word was not reported as used, check whether one of its inflected forms appears in the article.
            usedMeanIds: list = single_article_dict['usedMeanIds']
            article_words = split_text_to_word(single_article_dict['englishArticle'])
            for i in core_words:
                meaning_id = i.get('meaning_id', 0)
                if not meaning_id:
                    continue
                word = i["spell"]
                if meaning_id not in usedMeanIds and word in self.exchange_data:
                    words_exchanges_list = self.exchange_data[word]
                    for exchange_word in words_exchanges_list:
                        if exchange_word in article_words:
                            usedMeanIds.append(meaning_id)
                            break

            # Rename fields, then shuffle the candidates of each question and relabel them A-D.
            single_article_dict["body"] = single_article_dict.pop("englishArticle")
            single_article_dict["chinese"] = single_article_dict.pop("chineseArticle")
            for q in single_article_dict['questions']:
                data = q['candidates']
                shuffled_candidates = sample(data, len(data))
                labels = ['A', 'B', 'C', 'D']
                for index, candidate in enumerate(shuffled_candidates):
                    candidate['label'] = labels[index]
                q['candidates'] = shuffled_candidates

            return_json['articles'].append({**single_article_dict, "allWordAmount": allWordAmount, "articleWordAmount": articleWordAmount})
        return return_json
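
    # Illustrative shape of the returned value (field names taken from the code above, example numbers hypothetical):
    #   {"articles": [{"body": "...", "chinese": "...", "questions": [...], "usedMeanIds": [...],
    #                  "allWordAmount": 312, "articleWordAmount": 240}, ...]}
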
    @retry(stop=stop_after_attempt(3), wait=wait_fixed(2), reraise=True)
    def get_article(self, core_words: list, task_id: int, exercise_id: int, reading_level, article_length, n) -> dict:
        # Derive a target article length from the reading level when none was given.
        if not article_length:
            if 0 < reading_level <= 10:
                article_length = 50 + 10 * reading_level
            elif 10 < reading_level <= 20:
                article_length = 150 + 30 * (reading_level - 10)
            else:
                article_length = 450 + 20 * (reading_level - 20)

        # Map the reading level to one of four difficulty-control stages; fall back to stage 2.
        for index, (start, end) in enumerate([(1, 8), (9, 16), (17, 24), (25, 30)], start=1):
            if start <= reading_level <= end:
                difficulty_control_stage = index
                break
        else:
            difficulty_control_stage = 2

        difficulty_control = {
            1: {"grade": "小学", "desc_difficulty": "最简单最容易没有难度", "paragraph_count": "1-2",
                "desc2": "文章整体非常简洁,通俗易懂,适合初学者,刚入门,单词全是最常见的,语句通顺即可。",
                "choice_desc": "选择题难度尽可能简单,参考中国小学生水平"},
            2: {"grade": "初中", "desc_difficulty": "简单、常见、难度低", "paragraph_count": "2-3",
                "desc2": "文章整体难度适中,大约和中国初中生,中国CET-3,雅思4分这样的难度标准。",
                "choice_desc": "选择题难度适中,但是不要所有选择题让其直接在文中找到答案,参考中国初中生水平,中考标准。"},
            3: {"grade": "初中", "desc_difficulty": "简单、常见、难度低", "paragraph_count": "2-3",
                "desc2": "文章整体难度适中,大约和中国初中生,中国CET-3,雅思4分这样的难度标准。",
                "choice_desc": "选择题难度适中,但是不要所有选择题让其直接在文中找到答案,参考中国初中生水平,中考标准。"},
            4: {"grade": "高中", "desc_difficulty": "常见、高中难度的", "paragraph_count": "3-5",
                "desc2": "文章整体难度适中,大约和中国的高中生,中国CET-6,雅思6分这样的难度标准。",
                "choice_desc": "选择题难度偏难,要有迷惑性混淆性,答案不要出现直接在文中,4个选项要学生推理或逻辑判断,参考中国高中生水平,高考标准。"}
        }
        grade = difficulty_control[difficulty_control_stage]["grade"]
        select_difficulty = difficulty_control[difficulty_control_stage]["desc_difficulty"]
        select_paragraph_count = difficulty_control[difficulty_control_stage]["paragraph_count"]
        desc2 = difficulty_control[difficulty_control_stage]["desc2"]
        choice_desc = difficulty_control[difficulty_control_stage]["choice_desc"]

        shuffle(core_words)
        core_words_meaning_str = "; ".join([f"[{i['meaning_id']} {i['spell']} {i['meaning']}]" for i in core_words])
        no_escape_code = r"\\n\\n"

        sys_prompt = "你是一个专业的英语老师,擅长根据用户提供的词汇生成对应的英语文章和中文翻译和4个配套选择题。"
        q = f"""下面我会为你提供一组数据,[单词组](里面包含词义id,英语单词,中文词义),请根据这些单词的中文词义,\
生成一篇带中文翻译的考场英语文章,英语文章和中文翻译要有[标题]。特别注意这个单词有多个词义时,生成的英语文章一定要用提供的中文词义,例如我提供单词[change 零钱],就不要使用[变化]的词义。
要求:
1.必须用提供的这个词义的单词,其他单词使用{select_difficulty}的单词。{desc2}{choice_desc}
2.优先保证文章语句通顺,意思不要太生硬。不要为了使用特定的单词,造成文章语义前后不搭,允许不使用个别词义。
3.文章中使用提供单词,一定要和提供单词的中文词义匹配,尤其是一词多义时,务必使用提供单词的词义。必须要用提供单词的词义。如果用到的词义与提供单词词义不一致,请不要使用这个单词。
4.生成的文章要求{article_length}词左右,可以用{no_escape_code}字符分段,一般{select_paragraph_count}个段落左右。第一段是文章标题。不需要markdown格式。
5.允许不使用[单词组]的个别单词,优先保证文章整体意思通顺连贯和故事完整。
提供[单词组]:{core_words_meaning_str};
"""
        try:
            real_ip = self.real_ip_dict[task_id]
            demo_name = self.demo_name[task_id]
            gpt_resp = get_article_gpt_pydantic(q, temperature=1.2, real_ip=real_ip, demo_name=demo_name, model='gpt-4.1',
                                                check_fucn=CheckArticleResult.get_article_1, max_tokens=15000,
                                                sys_prompt=sys_prompt, n=n, task_id=task_id, exercise_id=exercise_id)
            multi_articles_dict = self.__parse_gpt_resp(gpt_resp=gpt_resp, core_words=core_words)
            return multi_articles_dict
        except httpx.HTTPError as e:
            logger.error(f"HTTP请求错误: {str(e)}")
            raise
        except json.JSONDecodeError as e:
            logger.error(f"JSON解析错误: {str(e)}")
            raise
        except Exception as e:
            log_err_e(e, "gpt生成文章回复其他错误.")
            raise

    def run_get_article_task(self, core_words, task_id, exercise_id, take_count, reading_level, article_length) -> dict:
        """
        :param core_words: core word data, priority 1; may be empty
        :param task_id: task id
        :param exercise_id: exercise id
        :param take_count: number of articles
        :param reading_level: reading level
        :param article_length: article length in words
        :return: parsed article data
        """
        try:
            return_json = self.get_article(core_words, task_id, exercise_id, reading_level, article_length, n=take_count)
            return return_json
        except Exception as e:
            logger.error(f"运行文章任务时发生错误: {str(e)}")
            raise

    def run_task(self, core_words, task_id, exercise_id, take_count, reading_level, article_length):
        try:
            outside_json = self.run_get_article_task(core_words, task_id, exercise_id, take_count, reading_level, article_length)
            return outside_json
        except Exception as e:
            log_err_e(e, msg="外层总任务捕获错误")

    def cleanup(self):
        """Clean up all resources (currently a no-op)."""
        pass
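

# A minimal usage sketch (not part of the original module): it shows how submit_task might be wired into a
# FastAPI endpoint. The route path, the request model ArticleRequest, its field defaults, and the port are
# assumptions for illustration only; the real service may expose a different interface.
if __name__ == "__main__":
    from fastapi import FastAPI, Request
    import uvicorn

    app = FastAPI()
    get_article_worker = GetArticle()

    class ArticleRequest(BaseModel):
        # Hypothetical request body mirroring the parameters of GetArticle.submit_task.
        core_words: list
        take_count: int = 2
        demo_name: str = "spring_bamboo"
        reading_level: int = 10
        article_length: int = 0
        exercise_id: int = 0

    @app.post("/article/reading-comprehension")
    def create_articles(req: ArticleRequest, request: Request, background_tasks: BackgroundTasks):
        # Forward the caller's IP and the request fields to the task runner; the parsed articles are
        # returned to the caller while the MySQL insert runs as a FastAPI background task.
        client_ip = request.client.host if request.client else ""
        return get_article_worker.submit_task(
            real_ip=client_ip,
            core_words=req.core_words,
            take_count=req.take_count,
            demo_name=req.demo_name,
            reading_level=req.reading_level,
            article_length=req.article_length,
            exercise_id=req.exercise_id,
            background_tasks=background_tasks,
        )

    uvicorn.run(app, host="0.0.0.0", port=8000)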