This commit is contained in:
dingfeng.wong
2025-07-22 01:35:00 +08:00
parent a3779aa3f9
commit c3c9664d59
+116 -9
View File
@@ -53,7 +53,7 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str:
lang: Language code for OCR (default: 'eng', not used by DocTR but kept for compatibility)
Returns:
Extracted text from the image
Extracted text from the image with preserved formatting
"""
try:
# Load the OCR model with state-of-the-art PARSeq recognition
@@ -65,15 +65,122 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str:
# Run OCR on the document
result = model(doc)
# Extract text from all pages and blocks
extracted_text = []
for page in result.pages:
for block in page.blocks:
for line in block.lines:
for word in line.words:
extracted_text.append(word.value)
# Extract text while preserving formatting
extracted_text_blocks = []
return ' '.join(extracted_text).strip()
for page in result.pages:
# Sort blocks by vertical position (top to bottom)
blocks_with_positions = []
for block in page.blocks:
# Calculate block position - we'll use the first line's first word's position
block_y = float('inf')
block_x = float('inf')
if block.lines:
first_line = block.lines[0]
if first_line.words:
first_word = first_line.words[0]
# Get word geometry - DocTR uses relative coordinates (0-1)
if hasattr(first_word, 'geometry'):
# geometry is ((xmin, ymin), (xmax, ymax)) by default, or a 4-point polygon when straight pages are not assumed
geometry = first_word.geometry
if len(geometry) >= 2:
block_x = min(point[0] for point in geometry)
block_y = min(point[1] for point in geometry)
blocks_with_positions.append((block_y, block_x, block))
# Sort blocks by position (top to bottom, left to right)
blocks_with_positions.sort(key=lambda x: (x[0], x[1]))
# Process each block
for _, _, block in blocks_with_positions:
block_lines = []
# Sort lines within block by vertical position
lines_with_positions = []
for line in block.lines:
line_y = float('inf')
line_x = float('inf')
if line.words:
first_word = line.words[0]
if hasattr(first_word, 'geometry'):
geometry = first_word.geometry
if len(geometry) >= 2:
line_x = min(point[0] for point in geometry)
line_y = min(point[1] for point in geometry)
lines_with_positions.append((line_y, line_x, line))
# Sort lines by position
lines_with_positions.sort(key=lambda x: (x[0], x[1]))
# Calculate base indentation from the leftmost line in the block
base_x = float('inf')
for _, line_x, _ in lines_with_positions:
if line_x < base_x:
base_x = line_x
# Process each line
for line_y, line_x, line in lines_with_positions:
# Extract words from this line
line_words = []
# Sort words within line by horizontal position
words_with_positions = []
for word in line.words:
word_x = float('inf')
if hasattr(word, 'geometry'):
geometry = word.geometry
if len(geometry) >= 2:
word_x = min(point[0] for point in geometry)
words_with_positions.append((word_x, word))
# Sort words by horizontal position
words_with_positions.sort(key=lambda x: x[0])
# Extract word text
for _, word in words_with_positions:
line_words.append(word.value)
if line_words:
# Calculate relative indentation
if base_x != float('inf') and line_x != float('inf'):
# Convert relative position difference to approximate spaces
# This is a heuristic - adjust the multiplier (50) based on your needs
relative_indent = max(0, int((line_x - base_x) * 50))
indentation = ' ' * relative_indent
else:
indentation = ''
# Join words in the line with spaces
line_text = indentation + ' '.join(line_words)
block_lines.append(line_text)
# Join lines in the block with newlines
if block_lines:
block_text = '\n'.join(block_lines)
extracted_text_blocks.append(block_text)
# Join blocks with double newlines to separate paragraphs/sections
final_text = '\n\n'.join(extracted_text_blocks).strip()
# Clean up excessive whitespace while preserving intentional formatting
lines = final_text.split('\n')
cleaned_lines = []
for line in lines:
# Preserve leading spaces but clean up excessive internal spacing
leading_spaces = len(line) - len(line.lstrip())
cleaned_content = ' '.join(line.split())
if cleaned_content: # Only add non-empty lines
cleaned_lines.append(' ' * leading_spaces + cleaned_content)
else:
cleaned_lines.append('') # Preserve empty lines
return '\n'.join(cleaned_lines)
except Exception as e:
print(f"Error performing OCR: {e}")