# -*- coding: utf-8 -*-
"""
1. Split the incoming article or sentence into words. For large texts, consider cutting them into
   smaller chunks so that a huge payload is never passed in at once; a huge text is split into
   several questions that each go through the normal pipeline, and the custom_id gets a 123456 suffix.
2. Check whether the article contains contractions; expand them and annotate only the first word of
   each contraction (e.g. "it's" is annotated with the meaning id of "it"). The expanded forms are
   written into the MySQL contraction table.
3. Fetch the meaning data pack for every word.

20250522  The article-annotation API is split into two endpoints: passing plain text returns
synchronously, passing an article object returns asynchronously.
The following cases are not annotated for now: 1. contractions 2. hyphenated words 3. words missing
from the Chunsun (春笋) meaning table.
Note: the incoming text is split on spaces, so characters joined together are treated as one word.
"""
import json
import warnings
from random import randint

from cachetools import TTLCache
from openpyxl import load_workbook

from common.split_text import split_text_to_word
from core.respone_format import *
from data.get_all_exchange_words import word_to_prototype
from gpt.chatgpt import get_annotation_gpt_pydantic
from tools.loglog import log_err_e, logger
from tools.thread_pool_manager import pool_executor


class Annotation:
    def __init__(self):
        self.all_task_data: dict[int, list] = {}
        self.all_task_result = TTLCache(maxsize=1000, ttl=3600)  # task_id -> annotation result, kept for 1 hour
        self.word_meaning_dict: dict[str, list[tuple[int, str, str]]] = {}  # word -> [(meaning_id, word, Chinese meaning)]
        self.prototype_words = set()
        self.change_prototype_dict = {}  # inflected form -> prototype

        self.get_excel_meaning_data()
        self.get_excel_change_data()

    def submit_task(self, english_text, split_blank, real_ip):
        """Synchronous entry point: submit the annotation job to the thread pool and wait for its result."""
        task_id = randint(10000000, 99999999)
        logger.info(f"/article/annotation 生成id。task_id:{task_id},split_blank:{split_blank},real_ip:{real_ip}")
        f = pool_executor.submit(self.main_annotation, task_id, english_text, split_blank)
        return f.result()

    def __run(self):
        warnings.warn("废弃函数", DeprecationWarning, stacklevel=2)

    def main_annotation(self, task_id: int, english_text: str, split_blank: bool):
        """Split the text, build the meaning data pack and let GPT annotate the text."""
        if split_blank:
            split_words = english_text.split()
        else:
            split_words = split_text_to_word(english_text, split_hyphen=False)
        meanings_data = self.query_meanings_data(split_words=split_words)
        result_annotation = self.__ai_annotation(english_text=english_text, meanings_data=meanings_data)
        self.all_task_result[task_id] = result_annotation
        return result_annotation

    async def query_result_by_taskid(self, task_id):
        """Asynchronous entry point: return the cached result for a task id, or an empty dict if it is not ready."""
        if task_id in self.all_task_result:
            return resp_200(data=self.all_task_result[task_id])
        return resp_200(data={})

    def get_excel_meaning_data(self):
        """Load the external Chunsun meaning table into a dict: word -> list of (meaning_id, word, Chinese meaning)."""
        spring_bamboo_meaning_path = "data/春笋词义表.xlsx"
        wb = load_workbook(spring_bamboo_meaning_path, read_only=True, data_only=True)
        ws = wb.active
        try:
            for index, row in enumerate(ws.values, start=1):
                if index == 1:  # skip the header row
                    continue
                word = row[3]
                id_and_meaning = (row[0], word, row[2])
                if word not in self.word_meaning_dict:
                    self.word_meaning_dict[word] = [id_and_meaning]
                else:
                    self.word_meaning_dict[word].append(id_and_meaning)
        except Exception as e:
            log_err_e(e, msg="打开春笋词义表错误")
        finally:
            wb.close()

    def get_excel_change_data(self):
        """Load the external Chunsun inflection table: column 0 is the prototype, column 1 the inflected form."""
        spring_bamboo_change_path = "data/春笋单词对照变形.xlsx"
        wb = load_workbook(spring_bamboo_change_path, read_only=True, data_only=True)
        ws = wb.active
        try:
            for row in ws.values:
                word_prototype = row[0]
                word_change = row[1]
                self.prototype_words.add(word_prototype)
                self.change_prototype_dict[word_change] = word_prototype
        except Exception as e:
            log_err_e(e, msg="打开春笋变形表错误")
        finally:
            wb.close()

    def to_prototype_word(self, word):
        """Map a word to its prototype: try the word itself, its lower-cased form, the inflection table, then the generic fallback."""
        lower_word = word.lower()
        if word in self.prototype_words:
            w_prototype = word
        elif lower_word in self.prototype_words:
            w_prototype = lower_word
        elif word in self.change_prototype_dict:
            w_prototype = self.change_prototype_dict[word]
        elif lower_word in self.change_prototype_dict:
            w_prototype = self.change_prototype_dict[lower_word]
        else:
            w_prototype = word_to_prototype(word)
        return w_prototype

    def __query_meaning(self, word: str) -> str:
        """
        :param word: a single word
        :return: formatted meaning text such as "[12 word 中文词义]", or "" when no meaning is found
        """
        # Direct hit on the word itself or its lower-cased form.
        for key in (word, word.lower()):
            if key in self.word_meaning_dict:
                return "".join(f"[{i[0]} {i[1]} {i[2]}]" for i in self.word_meaning_dict[key])

        # Otherwise fall back to the prototype (base form) of the word.
        w_prototype = self.to_prototype_word(word)
        key_to_check = w_prototype if w_prototype in self.word_meaning_dict else w_prototype.lower()
        if key_to_check in self.word_meaning_dict:
            meaning_data = set(self.word_meaning_dict[key_to_check])  # de-duplicate
            return "".join(f"[{i[0]} {i[1]} {i[2]}]" for i in meaning_data)
        return ""

    def query_meanings_data(self, split_words: list):
        """
        Query the meaning data pack for all words.
        :param split_words: word list produced by splitting the article or sentence (hyphenated words are split as well)
        :return: the meaning data pack as a single string
        """
        all_words_meaning_list = set()  # a set, so repeated words are listed only once
        for word in split_words:
            result_query_meaning: str = self.__query_meaning(word)
            if result_query_meaning:
                all_words_meaning_list.add(f"【{word} {result_query_meaning}】")
        new_data_str = "\n词义数据包:\n" + "\n".join(all_words_meaning_list) + "\n\n"
        return new_data_str

    @staticmethod
    def __parse_gpt_resp(gpt_resp: dict):
        """
        Parse the raw GPT reply.
        :param gpt_resp: the raw GPT response
        :return: the JSON-decoded content of the first choice
        """
        return json.loads(gpt_resp["choices"][0]["message"]["content"])

    def __ai_annotation(self, english_text, meanings_data):
        """
        AI meaning annotation.
        :param english_text: the English text
        :param meanings_data: the meaning data pack
        :return: the annotated text returned by GPT
        """
        sys_question = """你是一个英语文本的词义标注师,工作是按要求对句子或文章进行词义id的标注。下面我将提供一篇英语文本以及一个包含单词ID和词义的数据包。
你的工作是对英语文本中的每个单词的原型,根据提供的词义数据包选择这个单词原型最合适的词义,并在单词后附上对应的词义ID。标注格式为:word[word_id]。
要求:
1.如果词义数据包中没有该单词或找不到合适的词义,请标注该单词在文中词义的中文翻译。示例:seismography[地震学] car[猫]。
2.如果是[缩写字符’和',连字符-、中文、标点符号、数字、百分比、序号A.B.C.D.或者日期],这些不是英语单词,不用标记,保持原样不变。\
示例`It’s writer's 1999 2025 18:00 苹果 ____ A. B. C. D. e-mail Exhaust-fans`,这些都不标记。
3.标注每个英语单词,不是短语。错误示例:be good at[擅长]。正确示例:be[11] good[12] at[13]。
4.如果没有提供词义,则不标注。
5.任何缩写单词,不标注忽略,保持不变,如示例2。例如It’s,It's,It’ s,It' s。

回复格式要求如下:
- 请按照用户原文顺序和格式返回处理后的文本。空格和换行\\n,不用改变,不要加减空格,与原文一致。
- 每个单词后面标注上其对应的词义ID,格式为:`word[word_id]`。

最终回复 示例1:If[1] a[2] dog[3] causes[4] a[5] cat[6] accident[7] and[8] gets[9] killed[10]
示例2:It’s cold[672] and[9] snowy[2286] .

请确保理解上述说明并准备好接收英语文本及词义数据包。"""
        user_question = "英语文本:\n" + english_text + meanings_data
        gpt_resp = get_annotation_gpt_pydantic(question=user_question, sys_prompt=sys_question, max_tokens=8000)
        return self.__parse_gpt_resp(gpt_resp=gpt_resp)