ocr_utils.py 1.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
  1. import re
  2. try:
  3. import pytesseract
  4. from PIL import Image
  5. except ImportError:
  6. pytesseract = None
  7. def extract_page_number(file_path):
  8. """
  9. Extracts page number from a scanned image or PDF.
  10. For simplicity, this version focus on images.
  11. """
  12. if not pytesseract:
  13. print("pytesseract not installed. Skipping OCR.")
  14. return None
  15. try:
  16. # Load image
  17. image = Image.open(file_path)
  18. # Perform OCR
  19. text = pytesseract.image_to_string(image, lang='chi_sim+eng')
  20. # Look for page numbers (e.g., "第 1 页", "1 / 10", or just "1" at the bottom)
  21. # We'll use some regex patterns
  22. patterns = [
  23. r'第\s*(\d+)\s*页',
  24. r'(\d+)\s*/\s*\d+',
  25. r'-\s*(\d+)\s*-',
  26. r'\n\s*(\d+)\s*\n' # Often single digits on a line at the end
  27. ]
  28. for pattern in patterns:
  29. matches = re.findall(pattern, text)
  30. if matches:
  31. # Usually the last match is the page number if it's at the bottom
  32. return int(matches[-1])
  33. # If no pattern matches, look for any digits in the last few lines
  34. lines = text.strip().split('\n')
  35. if lines:
  36. last_lines = lines[-3:] # check last 3 lines
  37. for line in reversed(last_lines):
  38. digits = re.findall(r'\d+', line)
  39. if digits:
  40. return int(digits[-1])
  41. return None
  42. except Exception as e:
  43. print(f"Error during OCR: {e}")
  44. return None