xzd
/
ai_qback


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
							# -*- coding: utf-8 -*-


"""
1.对拿到的文章或句子，进行单词的切割；考虑，如何是大文本应该进行小段切割，防止巨量文本的传入;遇到巨量文本，切割后分多个问题，调用正常的运行流程；把custom_id的后缀加上123456标记
2.判断文章是否有缩写；将缩写还原，缩写标注第一个单词；例如it’s 标注it的词义id； 缩写获得的形式，写入mysql的缩写表
3.获取每个单词的词义数据包；

20250522
文章标注接口，分为2个接口，传文本同步返回，传文章对象异步返回。
下面这些情况，暂时不标注；1.缩写  2.连字符  3.春笋词义表内没有数据
注意：传入的文本将使用空格切分，连在一起的字符会被当做一个单词处理；
"""

import json
from random import randint

from cachetools import TTLCache
from openpyxl import load_workbook

from core.respone_format import *
from data.get_all_exchange_words import word_to_prototype
from gpt.chatgpt import get_annotation_gpt_pydantic
from tools.loglog import log_err_e, logger
from tools.thread_pool_manager import pool_executor


class Annotation:

    def __init__(self):

        self.all_task_data: dict[int, list] = {}

        self.all_task_result = TTLCache(maxsize=1000, ttl=3600)

        self.word_meaning_dict: dict[str, list[tuple[int, str, str]]] = {}

        self.prototype_words = set()
        self.change_prototype_dict = {}

        self.get_excel_meaning_data()
        self.get_excel_change_data()

    def submit_task(self, english_text, real_ip):
        task_id = randint(10000000, 99999999)
        logger.info(f"/article/annotation 生成id。task_id:{task_id},real_ip:{real_ip}")

        f = pool_executor.submit(self.main_annotation, task_id, english_text)
        r = f.result()

        return r

    def __run(self):

        for task_id, task_data in self.all_task_data.items():
            english_text, = task_data
            self.main_annotation(task_id, english_text)

    def main_annotation(self, task_id: int, english_text: str):

        split_words = english_text.split()

        meanings_data = self.query_meanings_data(split_words=split_words)

        result_annotation = self.__ai_annotation(english_text=english_text, meanings_data=meanings_data)

        self.all_task_result[task_id] = result_annotation

        return result_annotation

    async def query_result_by_taskid(self, task_id):

        if task_id in self.all_task_result:
            r = self.all_task_result[task_id]
            return resp_200(data=r)

        return resp_200(data={})

    def get_excel_meaning_data(self):
        """读取外部的春笋词义表，结构化到字典；单词为键，值[((词义id,中文词义))]"""
        spring_bamboo_meaning_path = "data/春笋词义表.xlsx"
        wb = load_workbook(spring_bamboo_meaning_path, read_only=True, data_only=True)
        ws = wb.active
        try:
            for index, row in enumerate(ws.values, start=1):
                if index == 1:
                    continue
                word = row[3]
                id_and_meaning = (row[0], word, row[2])
                if word not in self.word_meaning_dict:
                    self.word_meaning_dict[word] = [id_and_meaning]
                else:
                    self.word_meaning_dict[word].append(id_and_meaning)
        except Exception as e:
            log_err_e(e, msg="打开春笋词义表错误")
        finally:
            wb.close()

    def get_excel_change_data(self):
        """读取外部的春笋变形表"""
        spring_bamboo_change_path = "data/春笋单词对照变形.xlsx"
        wb = load_workbook(spring_bamboo_change_path, read_only=True, data_only=True)
        ws = wb.active
        try:
            for row in ws.values:
                word_prototype = row[0]
                word_change = row[1]
                self.prototype_words.add(word_prototype)
                self.change_prototype_dict[word_change] = word_prototype

        except Exception as e:
            log_err_e(e, msg="打开春笋变形表错误")
        finally:
            wb.close()

    def to_prototype_word(self, word):
        if word in self.prototype_words:
            w_prototype = word
        elif word.lower() in self.prototype_words:
            w_prototype = word.lower()
        elif word in self.change_prototype_dict:
            w_prototype = self.change_prototype_dict[word]
        else:
            w_prototype = word_to_prototype(word)
        return w_prototype

    def __query_meaning(self, word: str) -> str:
        """
        :param word: 单个单词
        :return: 加工好的词义文本
        """

        meaning_data1 = []
        if word in self.word_meaning_dict:
            meaning_data1.extend(self.word_meaning_dict[word])
            meaning_data_str = "".join([f"[{i[0]} {i[1]} {i[2]}]" for i in meaning_data1])
            return meaning_data_str
        elif word.lower() in self.word_meaning_dict:
            meaning_data1.extend(self.word_meaning_dict[word.lower()])
            meaning_data_str = "".join([f"[{i[0]} {i[1]} {i[2]}]" for i in meaning_data1])
            return meaning_data_str

        w_prototype = self.to_prototype_word(word)

        key_to_check = w_prototype if w_prototype in self.word_meaning_dict else w_prototype.lower()
        if key_to_check in self.word_meaning_dict:
            meaning_data = self.word_meaning_dict[key_to_check]
            meaning_data1.extend(meaning_data)
            meaning_data1 = list(set(meaning_data1))
            meaning_data_str = "".join([f"[{i[0]} {i[1]} {i[2]}]" for i in meaning_data1])
            return meaning_data_str

    def query_meanings_data(self, split_words: list):
        """
        查询所有单词的词义数据包
        :param split_words: 文章或句子被切割后的单词列表，连字符也拆开
        :return:
        """
        all_words_meaning_list = set()
        for word in split_words:
            result_query_meaning: str = self.__query_meaning(word)
            if result_query_meaning:
                all_words_meaning_list.add(f"【{word} {result_query_meaning}】")

        new_data_str = "\n词义数据包:\n" + "\n".join(all_words_meaning_list) + "\n\n"
        return new_data_str

    @staticmethod
    def __parse_gpt_resp(gpt_resp: dict):
        """
        解析ai-gpt的回复
        :param gpt_resp: GPT原始的回复
        :return:
        """

        r = json.loads(gpt_resp["choices"][0]["message"]["content"])
        return r

    def __ai_annotation(self, english_text, meanings_data):
        """
        AI词义标注
        :param english_text: 英语文本
        :param meanings_data: 词义数据包
        :return:
        """
        sys_question = """你是一个英语文本的词义标注师，工作是按要求对句子或文章进行词义id的标注。下面我将提供一篇英语文本以及一个包含单词ID和词义的数据包。
你的工作是对英语文本中的每个单词的原型，根据提供的词义数据包选择这个单词原型最合适的词义，并在单词后附上对应的词义ID。标注格式为：word[word_id]。
要求：
1.如果词义数据包中没有该单词或找不到合适的词义，请标注该单词在文中词义的中文翻译。示例：seismography[地震学] car[猫]。
2.如果是[连字符-、中文、标点符号、数字、百分比、序号A.B.C.D.或者日期]，这些不是英语单词，不用标记，保持原样不变。示例`1999 2025 18:00 苹果 ____ A. B. C. D. e-mail Exhaust-fans`，这些都不标记。
3.标注每个英语单词，不是短语。错误示例：be good at[擅长]。正确示例：be[11] good[12] at[13]。
4.如果没有提供词义，则不标注。

回复格式要求如下：
- 请按照用户原文顺序和格式返回处理后的文本。空格和换行\\n，不用改变，不要加减空格，与原文一致。
- 每个单词后面标注上其对应的词义ID，格式为：`word[word_id]`。

最终回复示例：If[1] a[2] dog[3] causes[4] a[5] cat[6] accident[7] and[8] gets[9] killed[10]
请确保理解上述说明并准备好接收英语文本及词义数据包。"""
        user_question = "英语文本:\n" + english_text + meanings_data
        gpt_resp = get_annotation_gpt_pydantic(question=user_question, sys_prompt=sys_question, max_tokens=8000)
        result_annotation = self.__parse_gpt_resp(gpt_resp=gpt_resp)
        return result_annotation