import re

try:
    import pytesseract
    from PIL import Image
except ImportError:
    # OCR is optional: without its dependencies extract_page_number() simply
    # reports that OCR is unavailable and returns None.
    pytesseract = None

# Page-number layouts, tried in priority order. The first pattern that matches
# anywhere in the OCR text wins.
_PAGE_NUMBER_PATTERNS = (
    re.compile(r'第\s*(\d+)\s*页'),      # "第 1 页"
    re.compile(r'(\d+)\s*/\s*\d+'),      # "1 / 10"
    re.compile(r'-\s*(\d+)\s*-'),        # "- 1 -"
    re.compile(r'\n\s*(\d+)\s*\n'),      # a number alone on its own line
)

# Fallback: any run of digits.
_DIGITS = re.compile(r'\d+')


def _find_page_number(text):
    """Return the page number found in OCR ``text``, or None if there is none."""
    for pattern in _PAGE_NUMBER_PATTERNS:
        matches = pattern.findall(text)
        if matches:
            # The last match is used on the expectation that page numbers are
            # printed at the bottom of the page.
            return int(matches[-1])

    # No known layout matched: scan the last three lines bottom-up and take
    # the right-most number on the first line that contains any digits.
    for line in reversed(text.strip().split('\n')[-3:]):
        digits = _DIGITS.findall(line)
        if digits:
            return int(digits[-1])

    return None


def extract_page_number(file_path):
    """
    Extract the page number from a scanned page image using OCR.

    For simplicity, this version focuses on images (PDF input is not
    handled specially).

    Args:
        file_path: Path to the image file, passed to ``PIL.Image.open``.

    Returns:
        The detected page number as an ``int``, or ``None`` when the OCR
        dependencies are not installed, the image cannot be opened or
        processed, or no number is found in the recognised text.

    This function does not raise: any error during OCR is printed and
    reported as ``None``.
    """
    if not pytesseract:
        print("pytesseract not installed. Skipping OCR.")
        return None

    try:
        # The context manager closes the underlying file handle once OCR is
        # done instead of leaving it open until garbage collection.
        with Image.open(file_path) as image:
            text = pytesseract.image_to_string(image, lang='chi_sim+eng')
        return _find_page_number(text)
    except Exception as e:
        # Deliberately broad: OCR is best-effort and callers expect None
        # rather than an exception.
        print(f"Error during OCR: {e}")
        return None