# linh / genealogy-app
							import re
try:
    import pytesseract
    from PIL import Image
except ImportError:
    pytesseract = None

# Page-number patterns, tried in priority order: the first pattern that
# produces any match decides the result.
_PAGE_NUMBER_PATTERNS = (
    r'第\s*(\d+)\s*页',    # e.g. "第 1 页"
    r'(\d+)\s*/\s*\d+',    # e.g. "1 / 10"
    r'-\s*(\d+)\s*-',      # e.g. "- 1 -"
    r'\n\s*(\d+)\s*\n',    # a number alone on its own line
)


def _parse_page_number(text):
    """Return the page number found in OCR ``text``, or None if there is none."""
    for pattern in _PAGE_NUMBER_PATTERNS:
        matches = re.findall(pattern, text)
        if matches:
            # Page numbers usually sit at the bottom of the page, so the last
            # match in the text is the most likely candidate.
            return int(matches[-1])

    # No explicit pattern matched: fall back to the last number appearing in
    # the final three lines, scanning from the bottom line upwards.
    for line in reversed(text.strip().split('\n')[-3:]):
        digits = re.findall(r'\d+', line)
        if digits:
            return int(digits[-1])

    return None


def extract_page_number(file_path):
    """
    Extract the page number from a scanned page image using OCR.

    The file is opened with PIL and recognised with Tesseract using the
    'chi_sim+eng' language packs; the recognised text is then searched for
    common page-number layouts. Only files that ``Image.open`` can read are
    handled here.

    Args:
        file_path: Path (or file object) accepted by ``Image.open``.

    Returns:
        The page number as an int, or None when pytesseract is unavailable,
        when OCR fails for any reason, or when no number could be found.
        Failures are reported with ``print`` rather than raised.
    """
    if not pytesseract:
        print("pytesseract not installed. Skipping OCR.")
        return None

    try:
        # Image.open is lazy and keeps the underlying file open; the context
        # manager guarantees the handle is released even if OCR raises.
        with Image.open(file_path) as image:
            text = pytesseract.image_to_string(image, lang='chi_sim+eng')
        return _parse_page_number(text)
    except Exception as e:
        # Deliberate best-effort: any OCR/parsing problem yields None.
        print(f"Error during OCR: {e}")
        return None