a

2025-07-22 01:35:00 +08:00
parent a3779aa3f9
commit c3c9664d59
1 changed file with 116 additions and 9 deletions
@@ -53,7 +53,7 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str:
        lang: Language code for OCR (default: 'eng', not used by DocTR but kept for compatibility)
    Returns:
-        Extracted text from the image
+        Extracted text from the image with preserved formatting
    """
    try:
        # Load the OCR model with state-of-the-art PARSeq recognition
@@ -65,15 +65,122 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str:
        # Run OCR on the document
        result = model(doc)
-        # Extract text from all pages and blocks
+        # Extract text while preserving formatting
-        extracted_text = []
+        extracted_text_blocks = []
        for page in result.pages:
            for block in page.blocks:
                for line in block.lines:
                    for word in line.words:
                        extracted_text.append(word.value)
-        return ' '.join(extracted_text).strip()
+        for page in result.pages:
            # Sort blocks by vertical position (top to bottom)
            blocks_with_positions = []
            for block in page.blocks:
                # Calculate block position - we'll use the first line's first word's position
                block_y = float('inf')
                block_x = float('inf')
                if block.lines:
                    first_line = block.lines[0]
                    if first_line.words:
                        first_word = first_line.words[0]
                        # Get word geometry - DocTR uses relative coordinates (0-1)
                        if hasattr(first_word, 'geometry'):
                            # geometry is typically a polygon with corner points
                            geometry = first_word.geometry
                            if len(geometry) >= 2:
                                block_x = min(point[0] for point in geometry)
                                block_y = min(point[1] for point in geometry)
                blocks_with_positions.append((block_y, block_x, block))
            # Sort blocks by position (top to bottom, left to right)
            blocks_with_positions.sort(key=lambda x: (x[0], x[1]))
            # Process each block
            for _, _, block in blocks_with_positions:
                block_lines = []
                # Sort lines within block by vertical position
                lines_with_positions = []
                for line in block.lines:
                    line_y = float('inf')
                    line_x = float('inf')
                    if line.words:
                        first_word = line.words[0]
                        if hasattr(first_word, 'geometry'):
                            geometry = first_word.geometry
                            if len(geometry) >= 2:
                                line_x = min(point[0] for point in geometry)
                                line_y = min(point[1] for point in geometry)
                    lines_with_positions.append((line_y, line_x, line))
                # Sort lines by position
                lines_with_positions.sort(key=lambda x: (x[0], x[1]))
                # Calculate base indentation from the leftmost line in the block
                base_x = float('inf')
                for _, line_x, _ in lines_with_positions:
                    if line_x < base_x:
                        base_x = line_x
                # Process each line
                for line_y, line_x, line in lines_with_positions:
                    # Extract words from this line
                    line_words = []
                    # Sort words within line by horizontal position
                    words_with_positions = []
                    for word in line.words:
                        word_x = float('inf')
                        if hasattr(word, 'geometry'):
                            geometry = word.geometry
                            if len(geometry) >= 2:
                                word_x = min(point[0] for point in geometry)
                        words_with_positions.append((word_x, word))
                    # Sort words by horizontal position
                    words_with_positions.sort(key=lambda x: x[0])
                    # Extract word text
                    for _, word in words_with_positions:
                        line_words.append(word.value)
                    if line_words:
                        # Calculate relative indentation
                        if base_x != float('inf') and line_x != float('inf'):
                            # Convert relative position difference to approximate spaces
                            # This is a heuristic - adjust the multiplier (50) based on your needs
                            relative_indent = max(0, int((line_x - base_x) * 50))
                            indentation = ' ' * relative_indent
                        else:
                            indentation = ''
                        # Join words in the line with spaces
                        line_text = indentation + ' '.join(line_words)
                        block_lines.append(line_text)
                # Join lines in the block with newlines
                if block_lines:
                    block_text = '\n'.join(block_lines)
                    extracted_text_blocks.append(block_text)
        # Join blocks with double newlines to separate paragraphs/sections
        final_text = '\n\n'.join(extracted_text_blocks).strip()
        # Clean up excessive whitespace while preserving intentional formatting
        lines = final_text.split('\n')
        cleaned_lines = []
        for line in lines:
            # Preserve leading spaces but clean up excessive internal spacing
            leading_spaces = len(line) - len(line.lstrip())
            cleaned_content = ' '.join(line.split())
            if cleaned_content:  # Only add non-empty lines
                cleaned_lines.append(' ' * leading_spaces + cleaned_content)
            else:
                cleaned_lines.append('')  # Preserve empty lines
        return '\n'.join(cleaned_lines)
    except Exception as e:
        print(f"Error performing OCR: {e}")