a
This commit is contained in:
@@ -53,7 +53,7 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str:
|
||||
lang: Language code for OCR (default: 'eng', not used by DocTR but kept for compatibility)
|
||||
|
||||
Returns:
|
||||
Extracted text from the image
|
||||
Extracted text from the image with preserved formatting
|
||||
"""
|
||||
try:
|
||||
# Load the OCR model with state-of-the-art PARSeq recognition
|
||||
@@ -65,15 +65,122 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str:
|
||||
# Run OCR on the document
|
||||
result = model(doc)
|
||||
|
||||
# Extract text from all pages and blocks
|
||||
extracted_text = []
|
||||
for page in result.pages:
|
||||
for block in page.blocks:
|
||||
for line in block.lines:
|
||||
for word in line.words:
|
||||
extracted_text.append(word.value)
|
||||
# Extract text while preserving formatting
|
||||
extracted_text_blocks = []
|
||||
|
||||
return ' '.join(extracted_text).strip()
|
||||
for page in result.pages:
|
||||
# Sort blocks by vertical position (top to bottom)
|
||||
blocks_with_positions = []
|
||||
|
||||
for block in page.blocks:
|
||||
# Calculate block position - we'll use the first line's first word's position
|
||||
block_y = float('inf')
|
||||
block_x = float('inf')
|
||||
|
||||
if block.lines:
|
||||
first_line = block.lines[0]
|
||||
if first_line.words:
|
||||
first_word = first_line.words[0]
|
||||
# Get word geometry - DocTR uses relative coordinates (0-1)
|
||||
if hasattr(first_word, 'geometry'):
|
||||
# geometry is a ((xmin, ymin), (xmax, ymax)) box by default, or a 4-point polygon for rotated pages; taking the min over all points handles both
|
||||
geometry = first_word.geometry
|
||||
if len(geometry) >= 2:
|
||||
block_x = min(point[0] for point in geometry)
|
||||
block_y = min(point[1] for point in geometry)
|
||||
|
||||
blocks_with_positions.append((block_y, block_x, block))
|
||||
|
||||
# Sort blocks by position (top to bottom, left to right)
|
||||
blocks_with_positions.sort(key=lambda x: (x[0], x[1]))
|
||||
|
||||
# Process each block
|
||||
for _, _, block in blocks_with_positions:
|
||||
block_lines = []
|
||||
|
||||
# Sort lines within block by vertical position
|
||||
lines_with_positions = []
|
||||
|
||||
for line in block.lines:
|
||||
line_y = float('inf')
|
||||
line_x = float('inf')
|
||||
|
||||
if line.words:
|
||||
first_word = line.words[0]
|
||||
if hasattr(first_word, 'geometry'):
|
||||
geometry = first_word.geometry
|
||||
if len(geometry) >= 2:
|
||||
line_x = min(point[0] for point in geometry)
|
||||
line_y = min(point[1] for point in geometry)
|
||||
|
||||
lines_with_positions.append((line_y, line_x, line))
|
||||
|
||||
# Sort lines by position
|
||||
lines_with_positions.sort(key=lambda x: (x[0], x[1]))
|
||||
|
||||
# Calculate base indentation from the leftmost line in the block
|
||||
base_x = float('inf')
|
||||
for _, line_x, _ in lines_with_positions:
|
||||
if line_x < base_x:
|
||||
base_x = line_x
|
||||
|
||||
# Process each line
|
||||
for line_y, line_x, line in lines_with_positions:
|
||||
# Extract words from this line
|
||||
line_words = []
|
||||
|
||||
# Sort words within line by horizontal position
|
||||
words_with_positions = []
|
||||
for word in line.words:
|
||||
word_x = float('inf')
|
||||
if hasattr(word, 'geometry'):
|
||||
geometry = word.geometry
|
||||
if len(geometry) >= 2:
|
||||
word_x = min(point[0] for point in geometry)
|
||||
words_with_positions.append((word_x, word))
|
||||
|
||||
# Sort words by horizontal position
|
||||
words_with_positions.sort(key=lambda x: x[0])
|
||||
|
||||
# Extract word text
|
||||
for _, word in words_with_positions:
|
||||
line_words.append(word.value)
|
||||
|
||||
if line_words:
|
||||
# Calculate relative indentation
|
||||
if base_x != float('inf') and line_x != float('inf'):
|
||||
# Convert relative position difference to approximate spaces
|
||||
# This is a heuristic: coordinates are page-relative (0-1), so a multiplier of 50 maps the full page width to 50 spaces - adjust it based on your needs
|
||||
relative_indent = max(0, int((line_x - base_x) * 50))
|
||||
indentation = ' ' * relative_indent
|
||||
else:
|
||||
indentation = ''
|
||||
|
||||
# Join words in the line with spaces
|
||||
line_text = indentation + ' '.join(line_words)
|
||||
block_lines.append(line_text)
|
||||
|
||||
# Join lines in the block with newlines
|
||||
if block_lines:
|
||||
block_text = '\n'.join(block_lines)
|
||||
extracted_text_blocks.append(block_text)
|
||||
|
||||
# Join blocks with double newlines to separate paragraphs/sections
|
||||
final_text = '\n\n'.join(extracted_text_blocks).strip()
|
||||
|
||||
# Clean up excessive whitespace while preserving intentional formatting
|
||||
lines = final_text.split('\n')
|
||||
cleaned_lines = []
|
||||
for line in lines:
|
||||
# Preserve leading spaces but clean up excessive internal spacing
|
||||
leading_spaces = len(line) - len(line.lstrip())
|
||||
cleaned_content = ' '.join(line.split())
|
||||
if cleaned_content: # Only add non-empty lines
|
||||
cleaned_lines.append(' ' * leading_spaces + cleaned_content)
|
||||
else:
|
||||
cleaned_lines.append('') # Preserve empty lines
|
||||
|
||||
return '\n'.join(cleaned_lines)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error performing OCR: {e}")
|
||||
|
||||
Reference in New Issue
Block a user