diff --git a/src/tooling/ocr_screenshot.py b/src/tooling/ocr_screenshot.py index 7a9eb22..76c82fd 100644 --- a/src/tooling/ocr_screenshot.py +++ b/src/tooling/ocr_screenshot.py @@ -53,7 +53,7 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str: lang: Language code for OCR (default: 'eng', not used by DocTR but kept for compatibility) Returns: - Extracted text from the image + Extracted text from the image with preserved formatting """ try: # Load the OCR model with state-of-the-art PARSeq recognition @@ -65,15 +65,122 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str: # Run OCR on the document result = model(doc) - # Extract text from all pages and blocks - extracted_text = [] - for page in result.pages: - for block in page.blocks: - for line in block.lines: - for word in line.words: - extracted_text.append(word.value) + # Extract text while preserving formatting + extracted_text_blocks = [] - return ' '.join(extracted_text).strip() + for page in result.pages: + # Sort blocks by vertical position (top to bottom) + blocks_with_positions = [] + + for block in page.blocks: + # Calculate block position - we'll use the first line's first word's position + block_y = float('inf') + block_x = float('inf') + + if block.lines: + first_line = block.lines[0] + if first_line.words: + first_word = first_line.words[0] + # Get word geometry - DocTR uses relative coordinates (0-1) + if hasattr(first_word, 'geometry'): + # geometry is typically a polygon with corner points + geometry = first_word.geometry + if len(geometry) >= 2: + block_x = min(point[0] for point in geometry) + block_y = min(point[1] for point in geometry) + + blocks_with_positions.append((block_y, block_x, block)) + + # Sort blocks by position (top to bottom, left to right) + blocks_with_positions.sort(key=lambda x: (x[0], x[1])) + + # Process each block + for _, _, block in blocks_with_positions: + block_lines = [] + + # Sort lines within block by vertical position + lines_with_positions = [] + + for line in block.lines: + line_y = float('inf') + line_x = float('inf') + + if line.words: + first_word = line.words[0] + if hasattr(first_word, 'geometry'): + geometry = first_word.geometry + if len(geometry) >= 2: + line_x = min(point[0] for point in geometry) + line_y = min(point[1] for point in geometry) + + lines_with_positions.append((line_y, line_x, line)) + + # Sort lines by position + lines_with_positions.sort(key=lambda x: (x[0], x[1])) + + # Calculate base indentation from the leftmost line in the block + base_x = float('inf') + for _, line_x, _ in lines_with_positions: + if line_x < base_x: + base_x = line_x + + # Process each line + for line_y, line_x, line in lines_with_positions: + # Extract words from this line + line_words = [] + + # Sort words within line by horizontal position + words_with_positions = [] + for word in line.words: + word_x = float('inf') + if hasattr(word, 'geometry'): + geometry = word.geometry + if len(geometry) >= 2: + word_x = min(point[0] for point in geometry) + words_with_positions.append((word_x, word)) + + # Sort words by horizontal position + words_with_positions.sort(key=lambda x: x[0]) + + # Extract word text + for _, word in words_with_positions: + line_words.append(word.value) + + if line_words: + # Calculate relative indentation + if base_x != float('inf') and line_x != float('inf'): + # Convert relative position difference to approximate spaces + # This is a heuristic - adjust the multiplier (50) based on your needs + relative_indent = max(0, int((line_x - base_x) * 50)) + indentation = ' ' * relative_indent + else: + indentation = '' + + # Join words in the line with spaces + line_text = indentation + ' '.join(line_words) + block_lines.append(line_text) + + # Join lines in the block with newlines + if block_lines: + block_text = '\n'.join(block_lines) + extracted_text_blocks.append(block_text) + + # Join blocks with double newlines to separate paragraphs/sections + final_text = '\n\n'.join(extracted_text_blocks).strip() + + # Clean up excessive whitespace while preserving intentional formatting + lines = final_text.split('\n') + cleaned_lines = [] + for line in lines: + # Preserve leading spaces but clean up excessive internal spacing + leading_spaces = len(line) - len(line.lstrip()) + cleaned_content = ' '.join(line.split()) + if cleaned_content: # Only add non-empty lines + cleaned_lines.append(' ' * leading_spaces + cleaned_content) + else: + cleaned_lines.append('') # Preserve empty lines + + return '\n'.join(cleaned_lines) except Exception as e: print(f"Error performing OCR: {e}")