| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950 |
- import re
- try:
- import pytesseract
- from PIL import Image
- except ImportError:
- pytesseract = None
def _page_number_from_text(text):
    """Return the page number found in OCR output ``text``, or ``None``.

    The explicit page-number patterns are tried in order; for the first
    pattern that matches anywhere in ``text``, the last occurrence is
    returned. If no pattern matches, the last three lines of the stripped
    text are scanned bottom-up and the last run of digits on the first line
    that contains any digits is returned.
    """
    # Look for page numbers (e.g., "第 1 页", "1 / 10", "- 1 -", or a bare
    # number on its own line).
    patterns = [
        r'第\s*(\d+)\s*页',
        r'(\d+)\s*/\s*\d+',
        r'-\s*(\d+)\s*-',
        r'\n\s*(\d+)\s*\n'  # Often single digits on a line at the end
    ]

    for pattern in patterns:
        matches = re.findall(pattern, text)
        if matches:
            # Usually the last match is the page number if it's at the bottom.
            return int(matches[-1])

    # Fallback: any digits in the last few lines. ``split`` always returns at
    # least one element, so no emptiness check is needed before slicing.
    for line in reversed(text.strip().split('\n')[-3:]):
        digits = re.findall(r'\d+', line)
        if digits:
            return int(digits[-1])

    return None


def extract_page_number(file_path):
    """Extract a page number from a scanned image via OCR.

    The file is opened with ``PIL.Image.open``, so only formats that call can
    read are handled. The OCR text is then searched for a page number.

    Args:
        file_path: Path (or other value accepted by ``Image.open``) of the
            scanned page image.

    Returns:
        The detected page number as an ``int``, or ``None`` when pytesseract
        is unavailable, OCR fails, or no number is found. Failures are
        reported with ``print`` rather than raised.
    """
    if not pytesseract:
        print("pytesseract not installed. Skipping OCR.")
        return None

    try:
        # The context manager releases the underlying file handle even when
        # OCR raises; a bare Image.open() would leave it open.
        with Image.open(file_path) as image:
            text = pytesseract.image_to_string(image, lang='chi_sim+eng')

        return _page_number_from_text(text)
    except Exception as e:
        # Deliberately best-effort: any OCR or parsing failure is reported
        # and mapped to "no page number" instead of propagating.
        print(f"Error during OCR: {e}")
        return None
|