This commit is contained in:
dingfeng.wong
2025-07-22 01:48:15 +08:00
parent c3c9664d59
commit eb47b6a22d
2 changed files with 406 additions and 121 deletions
+49 -3
View File
@@ -18,7 +18,7 @@ from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.syntax import Syntax from rich.syntax import Syntax
from .ocr_screenshot import copy_to_clipboard, perform_ocr, take_region_screenshot from .ocr_screenshot import copy_to_clipboard, perform_ocr, take_region_screenshot, perform_ocr_with_annotation
app = typer.Typer( app = typer.Typer(
name="ocr-screenshot", name="ocr-screenshot",
@@ -46,6 +46,26 @@ def main(
verbose: bool = typer.Option( verbose: bool = typer.Option(
default=False, default=False,
help="Show verbose output" help="Show verbose output"
),
annotate: bool = typer.Option(
default=False,
help="Create an annotated version of the image showing detected text regions"
),
show_words: bool = typer.Option(
default=True,
help="Show word-level bounding boxes in annotation (default: True)"
),
show_lines: bool = typer.Option(
default=False,
help="Show line-level bounding boxes in annotation"
),
show_blocks: bool = typer.Option(
default=False,
help="Show block-level bounding boxes in annotation"
),
show_text: bool = typer.Option(
default=False,
help="Overlay detected text on the annotated image"
) )
): ):
"""Take a region screenshot, perform OCR, and copy result to clipboard.""" """Take a region screenshot, perform OCR, and copy result to clipboard."""
@@ -82,7 +102,7 @@ def main(
if verbose: if verbose:
console.print(f"[green]✓ Screenshot saved to: {screenshot_path}[/green]") console.print(f"[green]✓ Screenshot saved to: {screenshot_path}[/green]")
# Step 2: Perform OCR # Step 2: Perform OCR (with optional annotation)
with Progress( with Progress(
SpinnerColumn(), SpinnerColumn(),
TextColumn("[progress.description]{task.description}"), TextColumn("[progress.description]{task.description}"),
@@ -90,7 +110,30 @@ def main(
transient=True transient=True
) as progress: ) as progress:
task = progress.add_task("[bold cyan]🔍 Performing OCR...", total=None) task = progress.add_task("[bold cyan]🔍 Performing OCR...", total=None)
if annotate:
# Create annotation output path
annotation_path = None
if save_image:
base_name = screenshot_path.stem
annotation_path = output_dir / f"{base_name}_annotated.png"
extracted_text, annotated_image_path = perform_ocr_with_annotation(
str(screenshot_path),
lang,
create_annotated=True,
annotation_output_path=str(annotation_path) if annotation_path else None,
show_words=show_words,
show_lines=show_lines,
show_blocks=show_blocks,
show_text=show_text
)
if annotated_image_path and verbose:
console.print(f"[green]✓ Annotated image saved to: {annotated_image_path}[/green]")
else:
extracted_text = perform_ocr(str(screenshot_path), lang) extracted_text = perform_ocr(str(screenshot_path), lang)
progress.update(task, description="[green]✓ OCR complete") progress.update(task, description="[green]✓ OCR complete")
if not extracted_text: if not extracted_text:
@@ -117,7 +160,10 @@ def main(
raise typer.Exit(1) raise typer.Exit(1)
# Success message # Success message
console.print("\n[bold green]✅ Text extracted and copied to clipboard![/bold green]") success_msg = "\n[bold green]✅ Text extracted and copied to clipboard![/bold green]"
if annotate:
success_msg += "\n[bold blue]📝 Annotated image created showing detected text regions.[/bold blue]"
console.print(success_msg)
if verbose: if verbose:
console.print("\n[bold]Extracted text:[/bold]") console.print("\n[bold]Extracted text:[/bold]")
+256 -17
View File
@@ -7,9 +7,10 @@ Core functionality for taking screenshots, performing OCR using DocTR, and clipb
import os import os
import subprocess import subprocess
from typing import Optional, Tuple
import pyperclip import pyperclip
from PIL import Image from PIL import Image, ImageDraw, ImageFont
from doctr.io import DocumentFile from doctr.io import DocumentFile
from doctr.models import ocr_predictor from doctr.models import ocr_predictor
@@ -44,28 +45,16 @@ def take_region_screenshot(output_path: str) -> bool:
return False return False
def perform_ocr(image_path: str, lang: str = 'eng') -> str: def doc_result_to_formatted_text(result) -> str:
""" """
Perform OCR on the given image using DocTR. Convert a DocTR OCR result to formatted text while preserving layout and indentation.
Args: Args:
image_path: Path to the image file result: DocTR OCR result object containing pages with detected text
lang: Language code for OCR (default: 'eng', not used by DocTR but kept for compatibility)
Returns: Returns:
Extracted text from the image with preserved formatting Formatted text string with preserved indentation and structure
""" """
try:
# Load the OCR model with state-of-the-art PARSeq recognition
model = ocr_predictor(det_arch='db_resnet50', reco_arch='parseq', pretrained=True)
# Load the document from the image file
doc = DocumentFile.from_images(image_path)
# Run OCR on the document
result = model(doc)
# Extract text while preserving formatting
extracted_text_blocks = [] extracted_text_blocks = []
for page in result.pages: for page in result.pages:
@@ -182,6 +171,31 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str:
return '\n'.join(cleaned_lines) return '\n'.join(cleaned_lines)
def perform_ocr(image_path: str, lang: str = 'eng') -> str:
    """
    Run the DocTR OCR pipeline on a single image and return its formatted text.

    Args:
        image_path: Path to the image file
        lang: Language code; accepted for compatibility only, this function never reads it

    Returns:
        Text produced by doc_result_to_formatted_text for the OCR result,
        or an empty string if any step raised an exception
    """
    try:
        # DBNet (ResNet-50) detection paired with PARSeq recognition, pretrained weights
        predictor = ocr_predictor(det_arch='db_resnet50', reco_arch='parseq', pretrained=True)

        # Wrap the image file as a document and run the predictor over it
        document = DocumentFile.from_images(image_path)
        ocr_result = predictor(document)

        # Formatting is delegated to the shared helper
        return doc_result_to_formatted_text(ocr_result)
    except Exception as e:
        # Best-effort contract: report the problem and hand back no text
        print(f"Error performing OCR: {e}")
        return ""
@@ -205,4 +219,229 @@ def copy_to_clipboard(text: str) -> bool:
return False return False
def annotate_image_with_ocr_results(
    image_path: str,
    result,
    output_path: Optional[str] = None,
    show_words: bool = True,
    show_lines: bool = False,
    show_blocks: bool = False,
    show_text: bool = False,
    word_color: Tuple[int, int, int, int] = (255, 0, 0, 128),  # Red with transparency
    line_color: Tuple[int, int, int, int] = (0, 255, 0, 128),  # Green with transparency
    block_color: Tuple[int, int, int, int] = (0, 0, 255, 128),  # Blue with transparency
    text_color: Tuple[int, int, int] = (255, 255, 255),  # White text
    box_width: int = 2
) -> str:
    """
    Annotate an image with OCR detection results, showing bounding boxes around detected text.

    Boxes are drawn on a transparent overlay that is alpha-composited onto the
    original image, so the semi-transparent colors blend with the screenshot.
    For each page, blocks are drawn first, then lines, then words, so the most
    detailed level ends up on top.

    Args:
        image_path: Path to the original image
        result: DocTR OCR result object (must expose ``pages`` -> ``blocks`` ->
            ``lines`` -> ``words``, each word carrying a relative ``geometry``)
        output_path: Optional path to save annotated image (if None, creates
            ``<image_path without extension>_annotated.png``)
        show_words: Whether to show word-level bounding boxes
        show_lines: Whether to show line-level bounding boxes
        show_blocks: Whether to show block-level bounding boxes
        show_text: Whether to overlay detected text on the image (text is
            drawn alongside word boxes, so it only appears when show_words
            is also enabled)
        word_color: RGBA color for word bounding boxes
        line_color: RGBA color for line bounding boxes
        block_color: RGBA color for block bounding boxes
        text_color: RGB color for text overlay
        box_width: Width of bounding box lines

    Returns:
        Path to the annotated image file, or an empty string if anything
        failed (the error is printed, never raised)
    """
    try:
        # Load the original image; the context manager releases the file
        # handle as soon as the RGBA copy has been made.
        with Image.open(image_path) as source:
            image = source.convert('RGBA')
        width, height = image.size

        # Create a transparent overlay for drawing
        overlay = Image.new('RGBA', image.size, (0, 0, 0, 0))
        draw = ImageDraw.Draw(overlay)

        # The font is only needed for the text overlay
        font = _load_annotation_font() if show_text else None

        for page in result.pages:
            if show_blocks:
                for block in page.blocks:
                    # Gather words from every line so a block is still boxed
                    # when its first line happens to be empty.
                    block_words = [word for line in block.lines for word in line.words]
                    _draw_enclosing_box(draw, block_words, width, height, block_color, box_width)

            if show_lines:
                for block in page.blocks:
                    for line in block.lines:
                        _draw_enclosing_box(draw, line.words, width, height, line_color, box_width)

            if show_words:
                for block in page.blocks:
                    for line in block.lines:
                        for word in line.words:
                            _draw_word(
                                draw, word, width, height,
                                word_color, box_width,
                                font if show_text else None,
                                text_color, show_text,
                            )

        # Composite the overlay onto the original image, then drop alpha for saving
        annotated = Image.alpha_composite(image, overlay).convert('RGB')

        # Generate output path if not provided
        if output_path is None:
            base_path = os.path.splitext(image_path)[0]
            output_path = f"{base_path}_annotated.png"

        annotated.save(output_path)
        return output_path

    except Exception as e:
        # Annotation is best-effort: report and signal failure with ""
        print(f"Error annotating image: {e}")
        return ""


def _load_annotation_font(size: int = 12):
    """Return the first loadable TrueType font from the known paths, else PIL's default font."""
    candidates = (
        "/System/Library/Fonts/Arial.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    )
    for candidate in candidates:
        try:
            return ImageFont.truetype(candidate, size)
        except OSError:
            continue
    return ImageFont.load_default()


def _word_pixels(word, width: int, height: int):
    """Convert a word's relative geometry into a list of absolute (x, y) pixel points."""
    geometry = getattr(word, 'geometry', None)
    if geometry is None:
        return []
    return [(int(point[0] * width), int(point[1] * height)) for point in geometry]


def _draw_enclosing_box(draw, words, width: int, height: int, color, box_width: int) -> None:
    """Draw one axis-aligned rectangle enclosing every point of the given words."""
    points = [point for word in words for point in _word_pixels(word, width, height)]
    if not points:
        return
    xs = [x for x, _ in points]
    ys = [y for _, y in points]
    draw.rectangle([min(xs), min(ys), max(xs), max(ys)], outline=color, width=box_width)


def _draw_word(draw, word, width: int, height: int, color, box_width: int,
               font, text_color, show_text: bool) -> None:
    """Outline a single word and optionally overlay its recognized text."""
    points = _word_pixels(word, width, height)

    if len(points) == 2:
        # Two points are taken as opposite corners of a straight box
        # ((xmin, ymin), (xmax, ymax)), the form docTR documents for
        # non-rotated pages. Normalize in case the corners arrive swapped.
        (x0, y0), (x1, y1) = points
        draw.rectangle(
            [min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)],
            outline=color,
            width=box_width,
        )
    elif len(points) >= 4:
        # Four or more points: draw the polygon as given (rotated boxes)
        draw.polygon(points, outline=color, width=box_width)
    else:
        # Nothing drawable (missing or malformed geometry)
        return

    if not (show_text and hasattr(word, 'value')):
        return

    # Position text at the top-left of the bounding box
    min_x = min(x for x, _ in points)
    min_y = min(y for _, y in points)

    # Draw the text eight times, offset by one pixel in black, to fake an
    # outline that keeps the label readable on any background.
    for dx in (-1, 0, 1):
        for dy in (-1, 0, 1):
            if dx != 0 or dy != 0:
                draw.text((min_x + dx, min_y + dy), word.value, font=font, fill=(0, 0, 0))

    # Draw the main text on top of the outline
    draw.text((min_x, min_y), word.value, font=font, fill=text_color)
def perform_ocr_with_annotation(
    image_path: str,
    lang: str = 'eng',
    create_annotated: bool = False,
    annotation_output_path: Optional[str] = None,
    **annotation_kwargs
) -> Tuple[str, str]:
    """
    Perform OCR and optionally create an annotated version of the image.

    OCR and annotation are guarded separately: a failure while annotating
    never discards text that was already extracted.

    Args:
        image_path: Path to the image file
        lang: Language code for OCR (default: 'eng'); accepted for
            compatibility, this function never reads it
        create_annotated: Whether to create an annotated image
        annotation_output_path: Optional path for annotated image
        **annotation_kwargs: Additional arguments forwarded unchanged to
            annotate_image_with_ocr_results

    Returns:
        Tuple of (extracted_text, annotated_image_path)
        Both are empty strings if OCR itself failed; annotated_image_path
        is an empty string if create_annotated is False or annotation failed
    """
    try:
        # Load the OCR model
        model = ocr_predictor(det_arch='db_resnet50', reco_arch='parseq', pretrained=True)

        # Load the document from the image file
        doc = DocumentFile.from_images(image_path)

        # Run OCR on the document
        result = model(doc)

        # Extract and format the text
        extracted_text = doc_result_to_formatted_text(result)
    except Exception as e:
        print(f"Error performing OCR with annotation: {e}")
        return "", ""

    if not create_annotated:
        return extracted_text, ""

    # Annotation is a secondary product: if the call itself blows up (for
    # example an unexpected keyword in annotation_kwargs), keep the text.
    try:
        annotated_path = annotate_image_with_ocr_results(
            image_path,
            result,
            annotation_output_path,
            **annotation_kwargs
        )
    except Exception as e:
        print(f"Error annotating image: {e}")
        annotated_path = ""

    return extracted_text, annotated_path