a
This commit is contained in:
@@ -53,7 +53,7 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str:
|
|||||||
lang: Language code for OCR (default: 'eng', not used by DocTR but kept for compatibility)
|
lang: Language code for OCR (default: 'eng', not used by DocTR but kept for compatibility)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Extracted text from the image
|
Extracted text from the image with preserved formatting
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# Load the OCR model with state-of-the-art PARSeq recognition
|
# Load the OCR model with state-of-the-art PARSeq recognition
|
||||||
@@ -65,15 +65,122 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str:
|
|||||||
# Run OCR on the document
|
# Run OCR on the document
|
||||||
result = model(doc)
|
result = model(doc)
|
||||||
|
|
||||||
# Extract text from all pages and blocks
|
# Extract text while preserving formatting
|
||||||
extracted_text = []
|
extracted_text_blocks = []
|
||||||
for page in result.pages:
|
|
||||||
for block in page.blocks:
|
|
||||||
for line in block.lines:
|
|
||||||
for word in line.words:
|
|
||||||
extracted_text.append(word.value)
|
|
||||||
|
|
||||||
return ' '.join(extracted_text).strip()
|
for page in result.pages:
|
||||||
|
# Sort blocks by vertical position (top to bottom)
|
||||||
|
blocks_with_positions = []
|
||||||
|
|
||||||
|
for block in page.blocks:
|
||||||
|
# Calculate block position - we'll use the first line's first word's position
|
||||||
|
block_y = float('inf')
|
||||||
|
block_x = float('inf')
|
||||||
|
|
||||||
|
if block.lines:
|
||||||
|
first_line = block.lines[0]
|
||||||
|
if first_line.words:
|
||||||
|
first_word = first_line.words[0]
|
||||||
|
# Get word geometry - DocTR uses relative coordinates (0-1)
|
||||||
|
if hasattr(first_word, 'geometry'):
|
||||||
|
# geometry is typically a polygon with corner points
|
||||||
|
geometry = first_word.geometry
|
||||||
|
if len(geometry) >= 2:
|
||||||
|
block_x = min(point[0] for point in geometry)
|
||||||
|
block_y = min(point[1] for point in geometry)
|
||||||
|
|
||||||
|
blocks_with_positions.append((block_y, block_x, block))
|
||||||
|
|
||||||
|
# Sort blocks by position (top to bottom, left to right)
|
||||||
|
blocks_with_positions.sort(key=lambda x: (x[0], x[1]))
|
||||||
|
|
||||||
|
# Process each block
|
||||||
|
for _, _, block in blocks_with_positions:
|
||||||
|
block_lines = []
|
||||||
|
|
||||||
|
# Sort lines within block by vertical position
|
||||||
|
lines_with_positions = []
|
||||||
|
|
||||||
|
for line in block.lines:
|
||||||
|
line_y = float('inf')
|
||||||
|
line_x = float('inf')
|
||||||
|
|
||||||
|
if line.words:
|
||||||
|
first_word = line.words[0]
|
||||||
|
if hasattr(first_word, 'geometry'):
|
||||||
|
geometry = first_word.geometry
|
||||||
|
if len(geometry) >= 2:
|
||||||
|
line_x = min(point[0] for point in geometry)
|
||||||
|
line_y = min(point[1] for point in geometry)
|
||||||
|
|
||||||
|
lines_with_positions.append((line_y, line_x, line))
|
||||||
|
|
||||||
|
# Sort lines by position
|
||||||
|
lines_with_positions.sort(key=lambda x: (x[0], x[1]))
|
||||||
|
|
||||||
|
# Calculate base indentation from the leftmost line in the block
|
||||||
|
base_x = float('inf')
|
||||||
|
for _, line_x, _ in lines_with_positions:
|
||||||
|
if line_x < base_x:
|
||||||
|
base_x = line_x
|
||||||
|
|
||||||
|
# Process each line
|
||||||
|
for line_y, line_x, line in lines_with_positions:
|
||||||
|
# Extract words from this line
|
||||||
|
line_words = []
|
||||||
|
|
||||||
|
# Sort words within line by horizontal position
|
||||||
|
words_with_positions = []
|
||||||
|
for word in line.words:
|
||||||
|
word_x = float('inf')
|
||||||
|
if hasattr(word, 'geometry'):
|
||||||
|
geometry = word.geometry
|
||||||
|
if len(geometry) >= 2:
|
||||||
|
word_x = min(point[0] for point in geometry)
|
||||||
|
words_with_positions.append((word_x, word))
|
||||||
|
|
||||||
|
# Sort words by horizontal position
|
||||||
|
words_with_positions.sort(key=lambda x: x[0])
|
||||||
|
|
||||||
|
# Extract word text
|
||||||
|
for _, word in words_with_positions:
|
||||||
|
line_words.append(word.value)
|
||||||
|
|
||||||
|
if line_words:
|
||||||
|
# Calculate relative indentation
|
||||||
|
if base_x != float('inf') and line_x != float('inf'):
|
||||||
|
# Convert relative position difference to approximate spaces
|
||||||
|
# This is a heuristic - adjust the multiplier (50) based on your needs
|
||||||
|
relative_indent = max(0, int((line_x - base_x) * 50))
|
||||||
|
indentation = ' ' * relative_indent
|
||||||
|
else:
|
||||||
|
indentation = ''
|
||||||
|
|
||||||
|
# Join words in the line with spaces
|
||||||
|
line_text = indentation + ' '.join(line_words)
|
||||||
|
block_lines.append(line_text)
|
||||||
|
|
||||||
|
# Join lines in the block with newlines
|
||||||
|
if block_lines:
|
||||||
|
block_text = '\n'.join(block_lines)
|
||||||
|
extracted_text_blocks.append(block_text)
|
||||||
|
|
||||||
|
# Join blocks with double newlines to separate paragraphs/sections
|
||||||
|
final_text = '\n\n'.join(extracted_text_blocks).strip()
|
||||||
|
|
||||||
|
# Clean up excessive whitespace while preserving intentional formatting
|
||||||
|
lines = final_text.split('\n')
|
||||||
|
cleaned_lines = []
|
||||||
|
for line in lines:
|
||||||
|
# Preserve leading spaces but clean up excessive internal spacing
|
||||||
|
leading_spaces = len(line) - len(line.lstrip())
|
||||||
|
cleaned_content = ' '.join(line.split())
|
||||||
|
if cleaned_content: # Only add non-empty lines
|
||||||
|
cleaned_lines.append(' ' * leading_spaces + cleaned_content)
|
||||||
|
else:
|
||||||
|
cleaned_lines.append('') # Preserve empty lines
|
||||||
|
|
||||||
|
return '\n'.join(cleaned_lines)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error performing OCR: {e}")
|
print(f"Error performing OCR: {e}")
|
||||||
|
|||||||
Reference in New Issue
Block a user