@@ -5,6 +5,9 @@ import json
 import os
 import pickle
 import re
+import threading
+import time
+import tempfile
 import numpy as np
 import faiss
 import pymysql
@@ -28,10 +31,19 @@ class QuestionDuplicateChecker:
             api_key=OPENAI_API_KEY,
             base_url=OPENAI_BASE_URL
         )
-        self.index_path = index_path
-        self.metadata_path = metadata_path
+        base_dir = os.path.dirname(os.path.abspath(__file__))
+        # Always use absolute paths so different working directories don't scatter index files
+        self.index_path = index_path if os.path.isabs(index_path) else os.path.join(base_dir, index_path)
+        self.metadata_path = metadata_path if os.path.isabs(metadata_path) else os.path.join(base_dir, metadata_path)
         self.index = None
         self.metadata = []  # Stores question IDs and text so matches can be echoed back
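+        # Cross-process coordination state: sync_lock serializes full syncs,
+        # reload_lock guards hot reloads, and the mtime/backoff fields track
+        # when the on-disk index was last read and when to retry after a failure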
+        self.sync_lock = threading.Lock()
+        self.reload_lock = threading.Lock()
+        self.index_mtime = None
+        self.metadata_mtime = None
+        self.sync_in_progress = False
+        self.reload_failures = 0
+        self.next_reload_time = 0.0

         # Weight settings
         self.weights = {
@@ -42,6 +54,12 @@ class QuestionDuplicateChecker:
         }
         # Dimension note: the 4 concatenated parts total 3072 * 4 = 12288
         self.dimension = 3072 * 4
+        # Cap FAISS/OpenMP threads to avoid CPU overload on small machines
+        faiss_threads = max(1, min(2, os.cpu_count() or 1))
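+        # Guarded call: faiss builds compiled without OpenMP may not expose
+        # omp_set_num_threads, so a missing attribute is silently tolerated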
+        try:
+            faiss.omp_set_num_threads(faiss_threads)
+        except Exception:
+            pass

         self._load_index()
@@ -98,6 +116,7 @@ class QuestionDuplicateChecker:
                 with open(self.metadata_path, 'rb') as f:
                     self.metadata = pickle.load(f)
                 print(f"✓ Loaded existing index with {len(self.metadata)} questions")
+                self._update_index_mtime()
             except Exception as e:
                 print(f"⚠️ Failed to load index: {e}; initializing a new one")
                 self._init_new_index()
@@ -109,14 +128,105 @@
         # Inner-product index, used with the weighted concatenated vectors
         self.index = faiss.IndexFlatIP(self.dimension)
         self.metadata = []
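+        # Reset reload tracking so the fresh empty index is not mistaken
+        # for an up-to-date copy of the files on disk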
+        self.index_mtime = None
+        self.metadata_mtime = None
+        self.reload_failures = 0
+        self.next_reload_time = 0.0
         print("✓ Initialized a new FAISS index")

+    def _update_index_mtime(self):
+        """Record the last-modified times of the index and metadata files"""
+        self.index_mtime = os.path.getmtime(self.index_path) if os.path.exists(self.index_path) else None
+        self.metadata_mtime = os.path.getmtime(self.metadata_path) if os.path.exists(self.metadata_path) else None
+
+    def ensure_index_loaded(self, force: bool = False) -> bool:
+        """Ensure the index is loaded from disk (keeps multi-process workers from serving a stale empty index)"""
+        # Back off after read failures so frequent reloads don't thrash the disk
+        now = time.time()
+        if not force and now < self.next_reload_time:
+            return False
+
+        index_exists = os.path.exists(self.index_path) and os.path.exists(self.metadata_path)
+        if not index_exists:
+            return False
+
+        need_reload = force
+        if not need_reload:
+            # If the in-memory index or metadata is empty, prefer loading from disk
+            if self.index is None or self.index.ntotal == 0 or not self.metadata:
+                need_reload = True
+            else:
+                # Reload when the on-disk files have been updated
+                current_index_mtime = os.path.getmtime(self.index_path)
+                current_metadata_mtime = os.path.getmtime(self.metadata_path)
+                if (self.index_mtime is None or self.metadata_mtime is None or
+                        current_index_mtime > self.index_mtime or current_metadata_mtime > self.metadata_mtime):
+                    need_reload = True
+
+        if not need_reload:
+            return False
+
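+        # Serialize the actual disk read; threads arriving mid-reload block
+        # briefly on the lock, then hit the double-check below and skip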
+        with self.reload_lock:
+            # While a sync is writing the index, avoid reloading a half-finished file
+            if self.sync_in_progress and not force:
+                return False
+
+            # Double-check to avoid redundant loads
+            if not force and self.index is not None and self.index.ntotal > 0 and self.metadata:
+                current_index_mtime = os.path.getmtime(self.index_path)
+                current_metadata_mtime = os.path.getmtime(self.metadata_path)
+                if (self.index_mtime is not None and self.metadata_mtime is not None and
+                        current_index_mtime <= self.index_mtime and current_metadata_mtime <= self.metadata_mtime):
+                    return False
+
+            try:
+                self.index = faiss.read_index(self.index_path)
+                with open(self.metadata_path, 'rb') as f:
+                    self.metadata = pickle.load(f)
+                self._update_index_mtime()
+                self.reload_failures = 0
+                self.next_reload_time = 0.0
+                print(f"↻ Reloaded index with {len(self.metadata)} questions")
+                return True
+            except Exception as e:
+                print(f"⚠️ Failed to reload index: {e}")
+                # Exponential backoff, capped at 60 seconds
+                self.reload_failures += 1
+                backoff = min(60.0, 2 ** self.reload_failures)
+                self.next_reload_time = time.time() + backoff
+                return False
+
     def save_index(self):
         """Save the index and metadata"""
-        faiss.write_index(self.index, self.index_path)
-        with open(self.metadata_path, 'wb') as f:
-            pickle.dump(self.metadata, f)
-        print(f"✓ Index and metadata saved to {self.index_path}")
+        # Write to temp files first, then atomically swap them in, so read/write
+        # races can't leave a corrupted, half-written index behind
+        index_dir = os.path.dirname(self.index_path)
+        meta_dir = os.path.dirname(self.metadata_path)
+        os.makedirs(index_dir, exist_ok=True)
+        os.makedirs(meta_dir, exist_ok=True)
+
+        tmp_index_fd, tmp_index_path = tempfile.mkstemp(prefix=".questions_tem.index.", dir=index_dir)
+        tmp_meta_fd, tmp_meta_path = tempfile.mkstemp(prefix=".questions_tem_metadata.pkl.", dir=meta_dir)
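+        # mkstemp returns open descriptors, but only the paths are needed:
+        # they are closed below so faiss and pickle can rewrite the files by path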
+        try:
+            os.close(tmp_index_fd)
+            os.close(tmp_meta_fd)
+
+            faiss.write_index(self.index, tmp_index_path)
+            with open(tmp_meta_path, 'wb') as f:
+                pickle.dump(self.metadata, f)
+                f.flush()
+                os.fsync(f.fileno())
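+
+            # os.replace is an atomic rename when source and destination share
+            # a filesystem (mkstemp(dir=...) above guarantees that), so readers
+            # see either the old files or the new ones, never a partial write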
+            os.replace(tmp_index_path, self.index_path)
+            os.replace(tmp_meta_path, self.metadata_path)
+
+            self._update_index_mtime()
+            print(f"✓ Index and metadata saved to {self.index_path}")
+        finally:
+            # Clean up the temp files (already gone if the replaces succeeded)
+            if os.path.exists(tmp_index_path):
+                os.remove(tmp_index_path)
+            if os.path.exists(tmp_meta_path):
+                os.remove(tmp_meta_path)

     def get_weighted_embedding(self, question: Dict) -> np.ndarray:
         """
@@ -356,6 +466,7 @@ class QuestionDuplicateChecker:
         """
         Main dedup logic (pre-ingest mode: does not update the database, does not auto-ingest)
         """
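+        # Pick up any index updates written by other worker processes first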
+        self.ensure_index_loaded()
         # 1. Fetch the question details
         question = self.fetch_question_from_db(question_id)
         if not question:
@@ -430,6 +541,7 @@ class QuestionDuplicateChecker:
         """
         Dedup against raw text content (pre-check mode)
         """
+        self.ensure_index_loaded()
         # 1. Build the weighted concatenated vector
         mega_vector = self.get_weighted_embedding(question_data)
         if mega_vector is None:
@@ -478,6 +590,7 @@ class QuestionDuplicateChecker:

     def get_question_data(self, question_id: int) -> Dict:
         """Fetch a specific ID's record from the vector store, plus the total count"""
+        self.ensure_index_loaded()
         total_count = self.index.ntotal if self.index else 0

         target_metadata = None
@@ -503,8 +616,13 @@ class QuestionDuplicateChecker:
         })
         self.save_index()

-    def sync_all_from_db(self, batch_size=50, max_workers=5):
+    def sync_all_from_db(self, batch_size=50, max_workers=None):
         """Sync all questions from the database into the index (weighted mode + batching + multithreading)"""
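+        # Non-blocking acquire: a concurrent sync request bails out immediately
+        # instead of queueing behind the one already running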
+        if not self.sync_lock.acquire(blocking=False):
+            print("⏳ Sync already in progress; ignoring the duplicate request")
+            return False
+
+        self.sync_in_progress = True
         print("🔄 Starting full sync (optimized, weighted mode)...")
         existing_ids = {m['id'] for m in self.metadata}
         try:
@@ -529,7 +647,7 @@ class QuestionDuplicateChecker:

             if total_new == 0:
                 print("✅ Already up to date; nothing to sync.")
-                return
+                return True

             print(f"📊 Total in database: {len(all_questions)}, new to sync: {total_new}")
@@ -540,8 +658,9 @@ class QuestionDuplicateChecker:
             mega_vectors = self.get_weighted_embeddings_batch(chunk)
             return chunk, mega_vectors

-        # Run the chunks concurrently in a thread pool
-        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Run the chunks concurrently in a thread pool (capped on small machines to avoid CPU overload)
+        worker_count = max_workers or max(1, min(2, os.cpu_count() or 1))
+        with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
             future_to_chunk = {executor.submit(process_chunk, chunk): chunk for chunk in chunks}

             count = 0
@@ -565,13 +684,18 @@ class QuestionDuplicateChecker:

             self.save_index()
             print(f"🎉 Sync complete! Current index total: {len(self.metadata)}")
+            return True
         except Exception as e:
             print(f"❌ Sync failed: {e}")
             import traceback
             traceback.print_exc()
+            return False
         finally:
             if 'conn' in locals() and conn:
                 conn.close()
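+            # The finally block only runs once the lock was acquired (the
+            # duplicate-request path returns before the try), so clear the
+            # flag and release; locked() is just a defensive check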
+            self.sync_in_progress = False
+            if self.sync_lock.locked():
+                self.sync_lock.release()

 if __name__ == "__main__":
     # Test code