a
This commit is contained in:
+50
-4
@@ -18,7 +18,7 @@ from rich.panel import Panel
|
|||||||
from rich.progress import Progress, SpinnerColumn, TextColumn
|
from rich.progress import Progress, SpinnerColumn, TextColumn
|
||||||
from rich.syntax import Syntax
|
from rich.syntax import Syntax
|
||||||
|
|
||||||
from .ocr_screenshot import copy_to_clipboard, perform_ocr, take_region_screenshot
|
from .ocr_screenshot import copy_to_clipboard, perform_ocr, take_region_screenshot, perform_ocr_with_annotation
|
||||||
|
|
||||||
app = typer.Typer(
|
app = typer.Typer(
|
||||||
name="ocr-screenshot",
|
name="ocr-screenshot",
|
||||||
@@ -46,6 +46,26 @@ def main(
|
|||||||
verbose: bool = typer.Option(
|
verbose: bool = typer.Option(
|
||||||
default=False,
|
default=False,
|
||||||
help="Show verbose output"
|
help="Show verbose output"
|
||||||
|
),
|
||||||
|
annotate: bool = typer.Option(
|
||||||
|
default=False,
|
||||||
|
help="Create an annotated version of the image showing detected text regions"
|
||||||
|
),
|
||||||
|
show_words: bool = typer.Option(
|
||||||
|
default=True,
|
||||||
|
help="Show word-level bounding boxes in annotation (default: True)"
|
||||||
|
),
|
||||||
|
show_lines: bool = typer.Option(
|
||||||
|
default=False,
|
||||||
|
help="Show line-level bounding boxes in annotation"
|
||||||
|
),
|
||||||
|
show_blocks: bool = typer.Option(
|
||||||
|
default=False,
|
||||||
|
help="Show block-level bounding boxes in annotation"
|
||||||
|
),
|
||||||
|
show_text: bool = typer.Option(
|
||||||
|
default=False,
|
||||||
|
help="Overlay detected text on the annotated image"
|
||||||
)
|
)
|
||||||
):
|
):
|
||||||
"""Take a region screenshot, perform OCR, and copy result to clipboard."""
|
"""Take a region screenshot, perform OCR, and copy result to clipboard."""
|
||||||
@@ -82,7 +102,7 @@ def main(
|
|||||||
if verbose:
|
if verbose:
|
||||||
console.print(f"[green]✓ Screenshot saved to: {screenshot_path}[/green]")
|
console.print(f"[green]✓ Screenshot saved to: {screenshot_path}[/green]")
|
||||||
|
|
||||||
# Step 2: Perform OCR
|
# Step 2: Perform OCR (with optional annotation)
|
||||||
with Progress(
|
with Progress(
|
||||||
SpinnerColumn(),
|
SpinnerColumn(),
|
||||||
TextColumn("[progress.description]{task.description}"),
|
TextColumn("[progress.description]{task.description}"),
|
||||||
@@ -90,7 +110,30 @@ def main(
|
|||||||
transient=True
|
transient=True
|
||||||
) as progress:
|
) as progress:
|
||||||
task = progress.add_task("[bold cyan]🔍 Performing OCR...", total=None)
|
task = progress.add_task("[bold cyan]🔍 Performing OCR...", total=None)
|
||||||
extracted_text = perform_ocr(str(screenshot_path), lang)
|
|
||||||
|
if annotate:
|
||||||
|
# Create annotation output path
|
||||||
|
annotation_path = None
|
||||||
|
if save_image:
|
||||||
|
base_name = screenshot_path.stem
|
||||||
|
annotation_path = output_dir / f"{base_name}_annotated.png"
|
||||||
|
|
||||||
|
extracted_text, annotated_image_path = perform_ocr_with_annotation(
|
||||||
|
str(screenshot_path),
|
||||||
|
lang,
|
||||||
|
create_annotated=True,
|
||||||
|
annotation_output_path=str(annotation_path) if annotation_path else None,
|
||||||
|
show_words=show_words,
|
||||||
|
show_lines=show_lines,
|
||||||
|
show_blocks=show_blocks,
|
||||||
|
show_text=show_text
|
||||||
|
)
|
||||||
|
|
||||||
|
if annotated_image_path and verbose:
|
||||||
|
console.print(f"[green]✓ Annotated image saved to: {annotated_image_path}[/green]")
|
||||||
|
else:
|
||||||
|
extracted_text = perform_ocr(str(screenshot_path), lang)
|
||||||
|
|
||||||
progress.update(task, description="[green]✓ OCR complete")
|
progress.update(task, description="[green]✓ OCR complete")
|
||||||
|
|
||||||
if not extracted_text:
|
if not extracted_text:
|
||||||
@@ -117,7 +160,10 @@ def main(
|
|||||||
raise typer.Exit(1)
|
raise typer.Exit(1)
|
||||||
|
|
||||||
# Success message
|
# Success message
|
||||||
console.print("\n[bold green]✅ Text extracted and copied to clipboard![/bold green]")
|
success_msg = "\n[bold green]✅ Text extracted and copied to clipboard![/bold green]"
|
||||||
|
if annotate:
|
||||||
|
success_msg += "\n[bold blue]📝 Annotated image created showing detected text regions.[/bold blue]"
|
||||||
|
console.print(success_msg)
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
console.print("\n[bold]Extracted text:[/bold]")
|
console.print("\n[bold]Extracted text:[/bold]")
|
||||||
|
|||||||
+356
-117
@@ -7,9 +7,10 @@ Core functionality for taking screenshots, performing OCR using DocTR, and clipb
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
import pyperclip
|
import pyperclip
|
||||||
from PIL import Image
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
from doctr.io import DocumentFile
|
from doctr.io import DocumentFile
|
||||||
from doctr.models import ocr_predictor
|
from doctr.models import ocr_predictor
|
||||||
|
|
||||||
@@ -44,6 +45,133 @@ def take_region_screenshot(output_path: str) -> bool:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _top_left_of(element) -> Tuple[float, float]:
    """Return the (x, y) top-left corner of an element's geometry.

    Both coordinates are ``float('inf')`` when the element has no usable
    geometry (attribute missing, ``None``, or fewer than two points), so such
    elements sort after everything that does have a position.
    """
    geometry = getattr(element, 'geometry', None)
    if geometry is None or len(geometry) < 2:
        return float('inf'), float('inf')
    return (
        min(point[0] for point in geometry),
        min(point[1] for point in geometry),
    )


def _first_word_origin(words) -> Tuple[float, float]:
    """Return the (x, y) origin of the first word in *words*, or (inf, inf) if empty."""
    if not words:
        return float('inf'), float('inf')
    return _top_left_of(words[0])


def doc_result_to_formatted_text(result, indent_scale: int = 50) -> str:
    """
    Convert a DocTR OCR result to formatted text while preserving layout and indentation.

    Blocks are ordered top-to-bottom then left-to-right, using the position of
    the first word of each block's first line. Lines inside a block are ordered
    the same way using their first word, and words inside a line are ordered
    left-to-right. Each line is indented relative to the smallest line origin
    in its block.

    Args:
        result: DocTR OCR result object containing pages with detected text.
            Word geometry is read as relative (0-1) coordinates.
        indent_scale: Heuristic multiplier converting a relative horizontal
            offset into a number of spaces (default: 50).

    Returns:
        Formatted text string. Blocks are separated by a blank line, runs of
        internal whitespace are collapsed to single spaces, and leading
        indentation of each line is kept. The whole text is stripped, so
        the very first line never carries indentation.
    """
    extracted_text_blocks = []

    for page in result.pages:
        # Position every block by the first word of its first line.
        positioned_blocks = []
        for block in page.blocks:
            first_line_words = block.lines[0].words if block.lines else None
            block_x, block_y = _first_word_origin(first_line_words)
            positioned_blocks.append((block_y, block_x, block))

        # Top to bottom, then left to right. The key never compares blocks.
        positioned_blocks.sort(key=lambda item: (item[0], item[1]))

        for _, _, block in positioned_blocks:
            positioned_lines = []
            for line in block.lines:
                line_x, line_y = _first_word_origin(line.words)
                positioned_lines.append((line_y, line_x, line))
            positioned_lines.sort(key=lambda item: (item[0], item[1]))

            # Indentation is measured from the leftmost line origin in the block.
            base_x = min(
                (line_x for _, line_x, _ in positioned_lines),
                default=float('inf'),
            )

            block_lines = []
            for _, line_x, line in positioned_lines:
                # sorted() is stable, so words without geometry keep their order.
                ordered_words = sorted(
                    line.words, key=lambda word: _top_left_of(word)[0]
                )
                line_words = [word.value for word in ordered_words]
                if not line_words:
                    continue

                if base_x != float('inf') and line_x != float('inf'):
                    # Heuristic: relative offset -> approximate space count.
                    relative_indent = max(0, int((line_x - base_x) * indent_scale))
                    indentation = ' ' * relative_indent
                else:
                    indentation = ''

                block_lines.append(indentation + ' '.join(line_words))

            if block_lines:
                extracted_text_blocks.append('\n'.join(block_lines))

    # Blank line between blocks to separate paragraphs/sections.
    final_text = '\n\n'.join(extracted_text_blocks).strip()

    # Collapse internal whitespace but keep each line's leading indentation.
    cleaned_lines = []
    for line in final_text.split('\n'):
        leading_spaces = len(line) - len(line.lstrip())
        cleaned_content = ' '.join(line.split())
        if cleaned_content:
            cleaned_lines.append(' ' * leading_spaces + cleaned_content)
        else:
            cleaned_lines.append('')  # Preserve empty separator lines

    return '\n'.join(cleaned_lines)
|
||||||
|
|
||||||
|
|
||||||
def perform_ocr(image_path: str, lang: str = 'eng') -> str:
|
def perform_ocr(image_path: str, lang: str = 'eng') -> str:
|
||||||
"""
|
"""
|
||||||
Perform OCR on the given image using DocTR.
|
Perform OCR on the given image using DocTR.
|
||||||
@@ -65,122 +193,8 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str:
|
|||||||
# Run OCR on the document
|
# Run OCR on the document
|
||||||
result = model(doc)
|
result = model(doc)
|
||||||
|
|
||||||
# Extract text while preserving formatting
|
# Extract and format the text
|
||||||
extracted_text_blocks = []
|
return doc_result_to_formatted_text(result)
|
||||||
|
|
||||||
for page in result.pages:
|
|
||||||
# Sort blocks by vertical position (top to bottom)
|
|
||||||
blocks_with_positions = []
|
|
||||||
|
|
||||||
for block in page.blocks:
|
|
||||||
# Calculate block position - we'll use the first line's first word's position
|
|
||||||
block_y = float('inf')
|
|
||||||
block_x = float('inf')
|
|
||||||
|
|
||||||
if block.lines:
|
|
||||||
first_line = block.lines[0]
|
|
||||||
if first_line.words:
|
|
||||||
first_word = first_line.words[0]
|
|
||||||
# Get word geometry - DocTR uses relative coordinates (0-1)
|
|
||||||
if hasattr(first_word, 'geometry'):
|
|
||||||
# geometry is typically a polygon with corner points
|
|
||||||
geometry = first_word.geometry
|
|
||||||
if len(geometry) >= 2:
|
|
||||||
block_x = min(point[0] for point in geometry)
|
|
||||||
block_y = min(point[1] for point in geometry)
|
|
||||||
|
|
||||||
blocks_with_positions.append((block_y, block_x, block))
|
|
||||||
|
|
||||||
# Sort blocks by position (top to bottom, left to right)
|
|
||||||
blocks_with_positions.sort(key=lambda x: (x[0], x[1]))
|
|
||||||
|
|
||||||
# Process each block
|
|
||||||
for _, _, block in blocks_with_positions:
|
|
||||||
block_lines = []
|
|
||||||
|
|
||||||
# Sort lines within block by vertical position
|
|
||||||
lines_with_positions = []
|
|
||||||
|
|
||||||
for line in block.lines:
|
|
||||||
line_y = float('inf')
|
|
||||||
line_x = float('inf')
|
|
||||||
|
|
||||||
if line.words:
|
|
||||||
first_word = line.words[0]
|
|
||||||
if hasattr(first_word, 'geometry'):
|
|
||||||
geometry = first_word.geometry
|
|
||||||
if len(geometry) >= 2:
|
|
||||||
line_x = min(point[0] for point in geometry)
|
|
||||||
line_y = min(point[1] for point in geometry)
|
|
||||||
|
|
||||||
lines_with_positions.append((line_y, line_x, line))
|
|
||||||
|
|
||||||
# Sort lines by position
|
|
||||||
lines_with_positions.sort(key=lambda x: (x[0], x[1]))
|
|
||||||
|
|
||||||
# Calculate base indentation from the leftmost line in the block
|
|
||||||
base_x = float('inf')
|
|
||||||
for _, line_x, _ in lines_with_positions:
|
|
||||||
if line_x < base_x:
|
|
||||||
base_x = line_x
|
|
||||||
|
|
||||||
# Process each line
|
|
||||||
for line_y, line_x, line in lines_with_positions:
|
|
||||||
# Extract words from this line
|
|
||||||
line_words = []
|
|
||||||
|
|
||||||
# Sort words within line by horizontal position
|
|
||||||
words_with_positions = []
|
|
||||||
for word in line.words:
|
|
||||||
word_x = float('inf')
|
|
||||||
if hasattr(word, 'geometry'):
|
|
||||||
geometry = word.geometry
|
|
||||||
if len(geometry) >= 2:
|
|
||||||
word_x = min(point[0] for point in geometry)
|
|
||||||
words_with_positions.append((word_x, word))
|
|
||||||
|
|
||||||
# Sort words by horizontal position
|
|
||||||
words_with_positions.sort(key=lambda x: x[0])
|
|
||||||
|
|
||||||
# Extract word text
|
|
||||||
for _, word in words_with_positions:
|
|
||||||
line_words.append(word.value)
|
|
||||||
|
|
||||||
if line_words:
|
|
||||||
# Calculate relative indentation
|
|
||||||
if base_x != float('inf') and line_x != float('inf'):
|
|
||||||
# Convert relative position difference to approximate spaces
|
|
||||||
# This is a heuristic - adjust the multiplier (50) based on your needs
|
|
||||||
relative_indent = max(0, int((line_x - base_x) * 50))
|
|
||||||
indentation = ' ' * relative_indent
|
|
||||||
else:
|
|
||||||
indentation = ''
|
|
||||||
|
|
||||||
# Join words in the line with spaces
|
|
||||||
line_text = indentation + ' '.join(line_words)
|
|
||||||
block_lines.append(line_text)
|
|
||||||
|
|
||||||
# Join lines in the block with newlines
|
|
||||||
if block_lines:
|
|
||||||
block_text = '\n'.join(block_lines)
|
|
||||||
extracted_text_blocks.append(block_text)
|
|
||||||
|
|
||||||
# Join blocks with double newlines to separate paragraphs/sections
|
|
||||||
final_text = '\n\n'.join(extracted_text_blocks).strip()
|
|
||||||
|
|
||||||
# Clean up excessive whitespace while preserving intentional formatting
|
|
||||||
lines = final_text.split('\n')
|
|
||||||
cleaned_lines = []
|
|
||||||
for line in lines:
|
|
||||||
# Preserve leading spaces but clean up excessive internal spacing
|
|
||||||
leading_spaces = len(line) - len(line.lstrip())
|
|
||||||
cleaned_content = ' '.join(line.split())
|
|
||||||
if cleaned_content: # Only add non-empty lines
|
|
||||||
cleaned_lines.append(' ' * leading_spaces + cleaned_content)
|
|
||||||
else:
|
|
||||||
cleaned_lines.append('') # Preserve empty lines
|
|
||||||
|
|
||||||
return '\n'.join(cleaned_lines)
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error performing OCR: {e}")
|
print(f"Error performing OCR: {e}")
|
||||||
@@ -205,4 +219,229 @@ def copy_to_clipboard(text: str) -> bool:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _to_pixel_points(geometry, width: int, height: int) -> list:
    """Scale relative (0-1) geometry points to integer pixel coordinates."""
    return [(int(point[0] * width), int(point[1] * height)) for point in geometry]


def _word_outline(geometry, width: int, height: int) -> list:
    """Return the pixel-space outline polygon for one word geometry.

    A two-point geometry is treated as opposite corners
    ((xmin, ymin), (xmax, ymax)) and expanded to the four rectangle corners.
    Geometries with four or more points are used as given. Anything else
    yields an empty list, meaning "nothing to draw".
    """
    points = _to_pixel_points(geometry, width, height)
    if len(points) >= 4:
        return points
    if len(points) == 2:
        (left, top), (right, bottom) = points
        return [(left, top), (right, top), (right, bottom), (left, bottom)]
    return []


def _draw_enclosing_box(draw, words, width: int, height: int, color, box_width: int) -> None:
    """Outline the axis-aligned rectangle enclosing every word in *words*."""
    points = []
    for word in words:
        if hasattr(word, 'geometry'):
            points.extend(_to_pixel_points(word.geometry, width, height))
    if not points:
        return
    xs = [point[0] for point in points]
    ys = [point[1] for point in points]
    draw.rectangle([min(xs), min(ys), max(xs), max(ys)], outline=color, width=box_width)


def _load_overlay_font():
    """Return a 12pt TrueType font from known system paths, else PIL's default font."""
    candidates = (
        "/System/Library/Fonts/Arial.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    )
    for font_path in candidates:
        try:
            return ImageFont.truetype(font_path, 12)
        except (OSError, IOError):
            continue
    return ImageFont.load_default()


def _draw_outlined_text(draw, origin, text, font, text_color) -> None:
    """Draw *text* at *origin* with a 1px black outline for legibility."""
    x, y = origin
    for dx in (-1, 0, 1):
        for dy in (-1, 0, 1):
            if dx != 0 or dy != 0:
                draw.text((x + dx, y + dy), text, font=font, fill=(0, 0, 0))
    draw.text((x, y), text, font=font, fill=text_color)


def annotate_image_with_ocr_results(
    image_path: str,
    result,
    output_path: Optional[str] = None,
    show_words: bool = True,
    show_lines: bool = False,
    show_blocks: bool = False,
    show_text: bool = False,
    word_color: Tuple[int, int, int, int] = (255, 0, 0, 128),  # Red with transparency
    line_color: Tuple[int, int, int, int] = (0, 255, 0, 128),  # Green with transparency
    block_color: Tuple[int, int, int, int] = (0, 0, 255, 128),  # Blue with transparency
    text_color: Tuple[int, int, int] = (255, 255, 255),  # White text
    box_width: int = 2
) -> str:
    """
    Annotate an image with OCR detection results, showing bounding boxes around detected text.

    Boxes are drawn on a transparent overlay that is alpha-composited onto the
    image, and the result is saved as RGB. Word geometry is read as relative
    (0-1) coordinates and scaled by the image size. Both two-point boxes and
    polygons with four or more points are supported for word outlines.

    Args:
        image_path: Path to the original image
        result: DocTR OCR result object
        output_path: Optional path to save annotated image (if None, the input
            path with its extension replaced by ``_annotated.png`` is used)
        show_words: Whether to show word-level bounding boxes
        show_lines: Whether to show line-level bounding boxes
        show_blocks: Whether to show block-level bounding boxes
        show_text: Whether to overlay detected text on the image (only drawn
            together with word boxes, i.e. when show_words is also True)
        word_color: RGBA color for word bounding boxes
        line_color: RGBA color for line bounding boxes
        block_color: RGBA color for block bounding boxes
        text_color: RGB color for text overlay
        box_width: Width of bounding box lines

    Returns:
        Path to the annotated image file, or an empty string if any error
        occurred (the error is printed, not raised).
    """
    try:
        # Close the source file promptly; convert() returns an independent copy.
        with Image.open(image_path) as source:
            image = source.convert('RGBA')
        width, height = image.size

        # Transparent layer so semi-transparent colors blend with the image.
        overlay = Image.new('RGBA', image.size, (0, 0, 0, 0))
        draw = ImageDraw.Draw(overlay)
        font = _load_overlay_font()

        for page in result.pages:
            # Coarsest level first so finer boxes are drawn on top.
            if show_blocks:
                for block in page.blocks:
                    block_words = [word for line in block.lines for word in line.words]
                    _draw_enclosing_box(draw, block_words, width, height, block_color, box_width)

            if show_lines:
                for block in page.blocks:
                    for line in block.lines:
                        _draw_enclosing_box(draw, line.words, width, height, line_color, box_width)

            if show_words:
                for block in page.blocks:
                    for line in block.lines:
                        for word in line.words:
                            if not hasattr(word, 'geometry'):
                                continue
                            outline = _word_outline(word.geometry, width, height)
                            if not outline:
                                continue

                            draw.polygon(outline, outline=word_color, width=box_width)

                            if show_text and hasattr(word, 'value'):
                                # Anchor the label at the box's top-left corner.
                                origin = (
                                    min(point[0] for point in outline),
                                    min(point[1] for point in outline),
                                )
                                _draw_outlined_text(draw, origin, word.value, font, text_color)

        # Flatten the overlay and drop alpha for saving.
        annotated = Image.alpha_composite(image, overlay).convert('RGB')

        if output_path is None:
            output_path = f"{os.path.splitext(image_path)[0]}_annotated.png"

        annotated.save(output_path)
        return output_path

    except Exception as e:
        # Best-effort: annotation failure must not abort the OCR flow.
        print(f"Error annotating image: {e}")
        return ""
|
||||||
|
|
||||||
|
|
||||||
|
def perform_ocr_with_annotation(
    image_path: str,
    lang: str = 'eng',
    create_annotated: bool = False,
    annotation_output_path: Optional[str] = None,
    **annotation_kwargs
) -> Tuple[str, str]:
    """
    Run OCR on an image and, on request, also write an annotated copy of it.

    Args:
        image_path: Path to the image file
        lang: Language code for OCR (default: 'eng'); accepted for interface
            compatibility but not read anywhere in this function
        create_annotated: Whether to create an annotated image
        annotation_output_path: Optional path for annotated image
        **annotation_kwargs: Forwarded unchanged to annotate_image_with_ocr_results

    Returns:
        Tuple of (extracted_text, annotated_image_path).
        annotated_image_path is an empty string when create_annotated is False.
        Both elements are empty strings if any step raised (the error is
        printed instead of propagated).
    """
    try:
        # Build the predictor, then feed it the document loaded from the image.
        predictor = ocr_predictor(det_arch='db_resnet50', reco_arch='parseq', pretrained=True)
        ocr_result = predictor(DocumentFile.from_images(image_path))

        text = doc_result_to_formatted_text(ocr_result)

        # Text-only path: nothing further to do.
        if not create_annotated:
            return text, ""

        image_out = annotate_image_with_ocr_results(
            image_path,
            ocr_result,
            annotation_output_path,
            **annotation_kwargs
        )
        return text, image_out

    except Exception as e:
        print(f"Error performing OCR with annotation: {e}")
        return "", ""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Reference in New Issue
Block a user