a

2025-07-22 01:48:15 +08:00
parent c3c9664d59
commit eb47b6a22d
2 changed files with 406 additions and 121 deletions
@@ -18,7 +18,7 @@ from rich.panel import Panel
 from rich.progress import Progress, SpinnerColumn, TextColumn
 from rich.syntax import Syntax
-from .ocr_screenshot import copy_to_clipboard, perform_ocr, take_region_screenshot
+from .ocr_screenshot import copy_to_clipboard, perform_ocr, take_region_screenshot, perform_ocr_with_annotation
 app = typer.Typer(
    name="ocr-screenshot",
@@ -46,6 +46,26 @@ def main(
    verbose: bool = typer.Option(
        default=False,
        help="Show verbose output"
    ),
    annotate: bool = typer.Option(
        default=False,
        help="Create an annotated version of the image showing detected text regions"
    ),
    show_words: bool = typer.Option(
        default=True,
        help="Show word-level bounding boxes in annotation (default: True)"
    ),
    show_lines: bool = typer.Option(
        default=False,
        help="Show line-level bounding boxes in annotation"
    ),
    show_blocks: bool = typer.Option(
        default=False,
        help="Show block-level bounding boxes in annotation"
    ),
    show_text: bool = typer.Option(
        default=False,
        help="Overlay detected text on the annotated image"
    )
 ):
    """Take a region screenshot, perform OCR, and copy result to clipboard."""
@@ -82,7 +102,7 @@ def main(
        if verbose:
            console.print(f"[green]✓ Screenshot saved to: {screenshot_path}[/green]")
-        # Step 2: Perform OCR
+        # Step 2: Perform OCR (with optional annotation)
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
@@ -90,7 +110,30 @@ def main(
            transient=True
        ) as progress:
            task = progress.add_task("[bold cyan]🔍 Performing OCR...", total=None)
            if annotate:
                # Create annotation output path
                annotation_path = None
                if save_image:
                    base_name = screenshot_path.stem
                    annotation_path = output_dir / f"{base_name}_annotated.png"
                extracted_text, annotated_image_path = perform_ocr_with_annotation(
                    str(screenshot_path), 
                    lang,
                    create_annotated=True,
                    annotation_output_path=str(annotation_path) if annotation_path else None,
                    show_words=show_words,
                    show_lines=show_lines,
                    show_blocks=show_blocks,
                    show_text=show_text
                )
                if annotated_image_path and verbose:
                    console.print(f"[green]✓ Annotated image saved to: {annotated_image_path}[/green]")
            else:
                extracted_text = perform_ocr(str(screenshot_path), lang)
            progress.update(task, description="[green]✓ OCR complete")
        if not extracted_text:
@@ -117,7 +160,10 @@ def main(
                raise typer.Exit(1)
        # Success message
-        console.print("\n[bold green]✅ Text extracted and copied to clipboard![/bold green]")
+        success_msg = "\n[bold green]✅ Text extracted and copied to clipboard![/bold green]"
        if annotate:
            success_msg += "\n[bold blue]📝 Annotated image created showing detected text regions.[/bold blue]"
        console.print(success_msg)
        if verbose:
            console.print("\n[bold]Extracted text:[/bold]")
@@ -7,9 +7,10 @@ Core functionality for taking screenshots, performing OCR using DocTR, and clipb
 import os
 import subprocess
 from typing import Optional, Tuple
 import pyperclip
-from PIL import Image
+from PIL import Image, ImageDraw, ImageFont
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
@@ -44,28 +45,16 @@ def take_region_screenshot(output_path: str) -> bool:
        return False
-def perform_ocr(image_path: str, lang: str = 'eng') -> str:
+def doc_result_to_formatted_text(result) -> str:
    """
-    Perform OCR on the given image using DocTR.
+    Convert a DocTR OCR result to formatted text while preserving layout and indentation.
    Args:
-        image_path: Path to the image file
+        result: DocTR OCR result object containing pages with detected text
        lang: Language code for OCR (default: 'eng', not used by DocTR but kept for compatibility)
    Returns:
-        Extracted text from the image with preserved formatting
+        Formatted text string with preserved indentation and structure
    """
    try:
        # Load the OCR model with state-of-the-art PARSeq recognition
        model = ocr_predictor(det_arch='db_resnet50', reco_arch='parseq', pretrained=True)
        # Load the document from the image file
        doc = DocumentFile.from_images(image_path)
        # Run OCR on the document
        result = model(doc)
        # Extract text while preserving formatting
    extracted_text_blocks = []
    for page in result.pages:
@@ -182,6 +171,31 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str:
    return '\n'.join(cleaned_lines)
 def perform_ocr(image_path: str, lang: str = 'eng') -> str:
     """
     Run the DocTR OCR pipeline on a single image and return its text.

     A predictor (db_resnet50 detection, parseq recognition, pretrained
     weights) is built on every call, the image is loaded as a document, and
     the prediction is passed to doc_result_to_formatted_text.

     Args:
         image_path: Path to the image file to read.
         lang: Accepted for compatibility only; this function never reads it.

     Returns:
         The formatted text, or an empty string if any step raised (the error
         is printed rather than propagated).
     """
     text = ""
     try:
         predictor = ocr_predictor(
             det_arch='db_resnet50',
             reco_arch='parseq',
             pretrained=True,
         )
         document = DocumentFile.from_images(image_path)
         text = doc_result_to_formatted_text(predictor(document))
     except Exception as exc:
         # Best-effort contract: report the failure and hand back no text.
         print(f"Error performing OCR: {exc}")
         return ""
     return text
@@ -205,4 +219,229 @@ def copy_to_clipboard(text: str) -> bool:
        return False
 def annotate_image_with_ocr_results(
     image_path: str,
     result,
     output_path: Optional[str] = None,
     show_words: bool = True,
     show_lines: bool = False,
     show_blocks: bool = False,
     show_text: bool = False,
     word_color: Tuple[int, int, int, int] = (255, 0, 0, 128),  # Red with transparency
     line_color: Tuple[int, int, int, int] = (0, 255, 0, 128),  # Green with transparency
     block_color: Tuple[int, int, int, int] = (0, 0, 255, 128),  # Blue with transparency
     text_color: Tuple[int, int, int] = (255, 255, 255),  # White text
     box_width: int = 2
 ) -> str:
     """
     Draw OCR detection boxes (and optionally the recognised text) over an image.

     Boxes are drawn on a transparent RGBA overlay, composited onto the
     original image, converted to RGB and saved.

     Word geometries are accepted in two shapes: a two-point box
     ((xmin, ymin), (xmax, ymax)) is drawn as a rectangle, and a geometry with
     four or more points is drawn as a polygon. Coordinates are treated as
     fractions of the image width/height and scaled to pixels.

     Args:
         image_path: Path to the original image.
         result: OCR result exposing ``pages`` -> ``blocks`` -> ``lines`` ->
             ``words``, where each word may carry ``geometry`` and ``value``.
         output_path: Where to save the annotated image. If None, the input
             path with its extension replaced by ``_annotated.png`` is used.
         show_words: Whether to draw word-level boxes.
         show_lines: Whether to draw line-level boxes.
         show_blocks: Whether to draw block-level boxes.
         show_text: Whether to write each word's text at the top-left of its
             box (only takes effect for words drawn by ``show_words``).
         word_color: RGBA outline colour for word boxes.
         line_color: RGBA outline colour for line boxes.
         block_color: RGBA outline colour for block boxes.
         text_color: RGB colour for the text overlay.
         box_width: Outline width in pixels.

     Returns:
         The path the annotated image was saved to, or an empty string if any
         step raised (the error is printed rather than propagated).
     """
     try:
         # Close the file handle as soon as the pixel data has been converted.
         with Image.open(image_path) as source_image:
             image = source_image.convert('RGBA')
         width, height = image.size

         # Transparent layer so the semi-transparent colours blend on composite.
         overlay = Image.new('RGBA', image.size, (0, 0, 0, 0))
         draw = ImageDraw.Draw(overlay)

         # Fonts are only needed for the text overlay; skip the lookup otherwise.
         font = None
         if show_text:
             font_candidates = (
                 "/System/Library/Fonts/Arial.ttf",
                 "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
             )
             for font_path in font_candidates:
                 try:
                     font = ImageFont.truetype(font_path, 12)
                     break
                 except OSError:
                     continue
             if font is None:
                 font = ImageFont.load_default()

         def to_pixels(geometry):
             """Scale relative (x, y) points to absolute pixel coordinates."""
             return [
                 (int(point[0] * width), int(point[1] * height))
                 for point in geometry
             ]

         def enclosing_box(words):
             """Return [min_x, min_y, max_x, max_y] around all words, or None."""
             points = [
                 point
                 for word in words
                 if hasattr(word, 'geometry')
                 for point in to_pixels(word.geometry)
             ]
             if not points:
                 return None
             xs = [point[0] for point in points]
             ys = [point[1] for point in points]
             return [min(xs), min(ys), max(xs), max(ys)]

         def draw_label(points, text):
             """Write text at the top-left of points with a 1px black outline."""
             left = min(point[0] for point in points)
             top = min(point[1] for point in points)
             for dx in (-1, 0, 1):
                 for dy in (-1, 0, 1):
                     if dx or dy:
                         draw.text(
                             (left + dx, top + dy),
                             text,
                             font=font,
                             fill=(0, 0, 0)  # Black outline
                         )
             draw.text((left, top), text, font=font, fill=text_color)

         for page in result.pages:
             # Blocks: one rectangle around every word of the block. Any line
             # with words contributes, not just the first one.
             if show_blocks:
                 for block in page.blocks:
                     block_words = [
                         word for line in block.lines for word in line.words
                     ]
                     box = enclosing_box(block_words)
                     if box is not None:
                         draw.rectangle(box, outline=block_color, width=box_width)

             # Lines: one rectangle around every word of the line.
             if show_lines:
                 for block in page.blocks:
                     for line in block.lines:
                         box = enclosing_box(line.words)
                         if box is not None:
                             draw.rectangle(box, outline=line_color, width=box_width)

             # Words (most detailed level).
             if show_words:
                 for block in page.blocks:
                     for line in block.lines:
                         for word in line.words:
                             if not hasattr(word, 'geometry'):
                                 continue
                             points = to_pixels(word.geometry)
                             if len(points) >= 4:
                                 draw.polygon(
                                     points, outline=word_color, width=box_width
                                 )
                             elif len(points) == 2:
                                 # Two-corner box: normalise the corners so the
                                 # rectangle is well-formed in either order.
                                 (x0, y0), (x1, y1) = points
                                 draw.rectangle(
                                     [min(x0, x1), min(y0, y1),
                                      max(x0, x1), max(y0, y1)],
                                     outline=word_color,
                                     width=box_width
                                 )
                             else:
                                 # Unrecognised geometry shape: nothing to draw.
                                 continue
                             if show_text and hasattr(word, 'value'):
                                 draw_label(points, word.value)

         # Composite the overlay onto the original, then drop alpha for saving.
         annotated = Image.alpha_composite(image, overlay).convert('RGB')

         if output_path is None:
             base_path = os.path.splitext(image_path)[0]
             output_path = f"{base_path}_annotated.png"

         annotated.save(output_path)
         return output_path
     except Exception as e:
         # Best-effort contract: annotation failures must not abort the caller.
         print(f"Error annotating image: {e}")
         return ""
 def perform_ocr_with_annotation(
     image_path: str,
     lang: str = 'eng',
     create_annotated: bool = False,
     annotation_output_path: Optional[str] = None,
     **annotation_kwargs
 ) -> Tuple[str, str]:
     """
     Run OCR on an image and, on request, also produce an annotated copy.

     The OCR result is computed once and used both for text extraction
     (doc_result_to_formatted_text) and for drawing
     (annotate_image_with_ocr_results).

     Args:
         image_path: Path to the image file.
         lang: Accepted for compatibility only; this function never reads it.
         create_annotated: When true, an annotated image is generated.
         annotation_output_path: Destination for the annotated image, passed
             through to annotate_image_with_ocr_results.
         **annotation_kwargs: Extra keyword arguments forwarded unchanged to
             annotate_image_with_ocr_results.

     Returns:
         ``(extracted_text, annotated_image_path)``. The path is an empty
         string when create_annotated is false; both elements are empty
         strings if any step raised (the error is printed, not propagated).
     """
     try:
         predictor = ocr_predictor(
             det_arch='db_resnet50',
             reco_arch='parseq',
             pretrained=True,
         )
         ocr_result = predictor(DocumentFile.from_images(image_path))
         text = doc_result_to_formatted_text(ocr_result)

         # Guard clause: no annotation requested, so no image path to report.
         if not create_annotated:
             return text, ""

         return text, annotate_image_with_ocr_results(
             image_path,
             ocr_result,
             annotation_output_path,
             **annotation_kwargs
         )
     except Exception as exc:
         print(f"Error performing OCR with annotation: {exc}")
         return "", ""