a

2025-07-22 01:35:00 +08:00
parent a3779aa3f9
commit c3c9664d59
1 changed files with 116 additions and 9 deletions
@@ -53,7 +53,7 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str:
        lang: Language code for OCR (default: 'eng', not used by DocTR but kept for compatibility)
        
    Returns:
-        Extracted text from the image
+        Extracted text from the image with preserved formatting
    """
    try:
        # Load the OCR model with state-of-the-art PARSeq recognition
@@ -65,15 +65,122 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str:
        # Run OCR on the document
        result = model(doc)
        
-        # Extract text from all pages and blocks
-        extracted_text = []
-        for page in result.pages:
-            for block in page.blocks:
-                for line in block.lines:
-                    for word in line.words:
-                        extracted_text.append(word.value)
+        # Extract text while preserving formatting
+        extracted_text_blocks = []
        
-        return ' '.join(extracted_text).strip()
+        for page in result.pages:
+            # Sort blocks by vertical position (top to bottom)
+            blocks_with_positions = []
+            
+            for block in page.blocks:
+                # Calculate block position - we'll use the first line's first word's position
+                block_y = float('inf')
+                block_x = float('inf')
+                
+                if block.lines:
+                    first_line = block.lines[0]
+                    if first_line.words:
+                        first_word = first_line.words[0]
+                        # Get word geometry - DocTR uses relative coordinates (0-1)
+                        if hasattr(first_word, 'geometry'):
+                            # geometry is ((xmin, ymin), (xmax, ymax)) for straight boxes, or a 4-point polygon for rotated pages
+                            geometry = first_word.geometry
+                            if len(geometry) >= 2:
+                                block_x = min(point[0] for point in geometry)
+                                block_y = min(point[1] for point in geometry)
+                
+                blocks_with_positions.append((block_y, block_x, block))
+            
+            # Sort blocks by position (top to bottom, left to right)
+            blocks_with_positions.sort(key=lambda x: (x[0], x[1]))
+            
+            # Process each block
+            for _, _, block in blocks_with_positions:
+                block_lines = []
+                
+                # Sort lines within block by vertical position
+                lines_with_positions = []
+                
+                for line in block.lines:
+                    line_y = float('inf')
+                    line_x = float('inf')
+                    
+                    if line.words:
+                        first_word = line.words[0]
+                        if hasattr(first_word, 'geometry'):
+                            geometry = first_word.geometry
+                            if len(geometry) >= 2:
+                                line_x = min(point[0] for point in geometry)
+                                line_y = min(point[1] for point in geometry)
+                    
+                    lines_with_positions.append((line_y, line_x, line))
+                
+                # Sort lines by position
+                lines_with_positions.sort(key=lambda x: (x[0], x[1]))
+                
+                # Calculate base indentation from the leftmost line in the block
+                base_x = float('inf')
+                for _, line_x, _ in lines_with_positions:
+                    if line_x < base_x:
+                        base_x = line_x
+                
+                # Process each line
+                for line_y, line_x, line in lines_with_positions:
+                    # Extract words from this line
+                    line_words = []
+                    
+                    # Sort words within line by horizontal position
+                    words_with_positions = []
+                    for word in line.words:
+                        word_x = float('inf')
+                        if hasattr(word, 'geometry'):
+                            geometry = word.geometry
+                            if len(geometry) >= 2:
+                                word_x = min(point[0] for point in geometry)
+                        words_with_positions.append((word_x, word))
+                    
+                    # Sort words by horizontal position
+                    words_with_positions.sort(key=lambda x: x[0])
+                    
+                    # Extract word text
+                    for _, word in words_with_positions:
+                        line_words.append(word.value)
+                    
+                    if line_words:
+                        # Calculate relative indentation
+                        if base_x != float('inf') and line_x != float('inf'):
+                            # Convert relative position difference to approximate spaces
+                            # This is a heuristic - adjust the multiplier (50) based on your needs
+                            relative_indent = max(0, int((line_x - base_x) * 50))
+                            indentation = ' ' * relative_indent
+                        else:
+                            indentation = ''
+                        
+                        # Join words in the line with spaces
+                        line_text = indentation + ' '.join(line_words)
+                        block_lines.append(line_text)
+                
+                # Join lines in the block with newlines
+                if block_lines:
+                    block_text = '\n'.join(block_lines)
+                    extracted_text_blocks.append(block_text)
+        
+        # Join blocks with double newlines to separate paragraphs/sections
+        final_text = '\n\n'.join(extracted_text_blocks).strip()
+        
+        # Clean up excessive whitespace while preserving intentional formatting
+        lines = final_text.split('\n')
+        cleaned_lines = []
+        for line in lines:
+            # Preserve leading spaces but clean up excessive internal spacing
+            leading_spaces = len(line) - len(line.lstrip())
+            cleaned_content = ' '.join(line.split())
+            if cleaned_content:  # Only add non-empty lines
+                cleaned_lines.append(' ' * leading_spaces + cleaned_content)
+            else:
+                cleaned_lines.append('')  # Preserve empty lines
+        
+        return '\n'.join(cleaned_lines)
        
    except Exception as e:
        print(f"Error performing OCR: {e}")