a

2025-07-22 01:48:15 +08:00
parent c3c9664d59
commit eb47b6a22d
2 changed files with 406 additions and 121 deletions
@@ -18,7 +18,7 @@ from rich.panel import Panel
 from rich.progress import Progress, SpinnerColumn, TextColumn
 from rich.syntax import Syntax

-from .ocr_screenshot import copy_to_clipboard, perform_ocr, take_region_screenshot
+from .ocr_screenshot import copy_to_clipboard, perform_ocr, take_region_screenshot, perform_ocr_with_annotation

 app = typer.Typer(
    name="ocr-screenshot",
@@ -46,6 +46,26 @@ def main(
    verbose: bool = typer.Option(
        default=False,
        help="Show verbose output"
+    ),
+    annotate: bool = typer.Option(
+        default=False,
+        help="Create an annotated version of the image showing detected text regions"
+    ),
+    show_words: bool = typer.Option(
+        default=True,
+        help="Show word-level bounding boxes in annotation (default: True)"
+    ),
+    show_lines: bool = typer.Option(
+        default=False,
+        help="Show line-level bounding boxes in annotation"
+    ),
+    show_blocks: bool = typer.Option(
+        default=False,
+        help="Show block-level bounding boxes in annotation"
+    ),
+    show_text: bool = typer.Option(
+        default=False,
+        help="Overlay detected text on the annotated image"
    )
 ):
    """Take a region screenshot, perform OCR, and copy result to clipboard."""
@@ -82,7 +102,7 @@ def main(
        if verbose:
            console.print(f"[green]✓ Screenshot saved to: {screenshot_path}[/green]")
        
-        # Step 2: Perform OCR
+        # Step 2: Perform OCR (with optional annotation)
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
@@ -90,7 +110,30 @@ def main(
            transient=True
        ) as progress:
            task = progress.add_task("[bold cyan]🔍 Performing OCR...", total=None)
-            extracted_text = perform_ocr(str(screenshot_path), lang)
+            
+            if annotate:
+                # Create annotation output path
+                annotation_path = None
+                if save_image:
+                    base_name = screenshot_path.stem
+                    annotation_path = output_dir / f"{base_name}_annotated.png"
+                
+                extracted_text, annotated_image_path = perform_ocr_with_annotation(
+                    str(screenshot_path), 
+                    lang,
+                    create_annotated=True,
+                    annotation_output_path=str(annotation_path) if annotation_path else None,
+                    show_words=show_words,
+                    show_lines=show_lines,
+                    show_blocks=show_blocks,
+                    show_text=show_text
+                )
+                
+                if annotated_image_path and verbose:
+                    console.print(f"[green]✓ Annotated image saved to: {annotated_image_path}[/green]")
+            else:
+                extracted_text = perform_ocr(str(screenshot_path), lang)
+                
            progress.update(task, description="[green]✓ OCR complete")
        
        if not extracted_text:
@@ -117,7 +160,10 @@ def main(
                raise typer.Exit(1)
        
        # Success message
-        console.print("\n[bold green]✅ Text extracted and copied to clipboard![/bold green]")
+        success_msg = "\n[bold green]✅ Text extracted and copied to clipboard![/bold green]"
+        if annotate:
+            success_msg += "\n[bold blue]📝 Annotated image created showing detected text regions.[/bold blue]"
+        console.print(success_msg)
        
        if verbose:
            console.print("\n[bold]Extracted text:[/bold]")
@@ -7,9 +7,10 @@ Core functionality for taking screenshots, performing OCR using DocTR, and clipb

 import os
 import subprocess
+from typing import Optional, Tuple

 import pyperclip
-from PIL import Image
+from PIL import Image, ImageDraw, ImageFont
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor

@@ -44,6 +45,133 @@ def take_region_screenshot(output_path: str) -> bool:
        return False


+def _geometry_origin(element):
+    """
+    Return the (x, y) origin of an element's geometry.
+
+    The origin is the smallest x and the smallest y found among the geometry's
+    points. Elements without a usable geometry (missing, None, or fewer than
+    two points) yield (inf, inf) so they sort after every positioned element.
+    """
+    geometry = getattr(element, 'geometry', None)
+    if geometry is None or len(geometry) < 2:
+        return float('inf'), float('inf')
+    return (
+        min(point[0] for point in geometry),
+        min(point[1] for point in geometry),
+    )
+
+
+def _first_word_origin(words):
+    """Return the origin of the first word in ``words``, or (inf, inf) when empty."""
+    if not words:
+        return float('inf'), float('inf')
+    return _geometry_origin(words[0])
+
+
+def doc_result_to_formatted_text(result, indent_scale: int = 50) -> str:
+    """
+    Convert a DocTR OCR result to formatted text while preserving layout and indentation.
+
+    Blocks are ordered top-to-bottom (then left-to-right) by the position of
+    the first word of their first line. Lines inside a block are ordered the
+    same way, and words inside a line are ordered left-to-right. Each line is
+    indented relative to the leftmost line of its block.
+
+    Args:
+        result: DocTR OCR result object containing pages with detected text
+        indent_scale: Number of spaces that a horizontal offset of 1.0 maps to.
+            Geometry coordinates are treated as relative page positions, so the
+            default of 50 is a heuristic that may need tuning per input.
+
+    Returns:
+        Formatted text string with preserved indentation and structure.
+        Blocks are separated by one empty line; an empty result gives ''.
+    """
+    unset = float('inf')
+    extracted_text_blocks = []
+
+    for page in result.pages:
+        # Anchor every block on the first word of its first line.
+        positioned_blocks = []
+        for block in page.blocks:
+            first_line_words = block.lines[0].words if block.lines else []
+            block_x, block_y = _first_word_origin(first_line_words)
+            positioned_blocks.append((block_y, block_x, block))
+        positioned_blocks.sort(key=lambda item: (item[0], item[1]))
+
+        for _, _, block in positioned_blocks:
+            # Anchor every line on its first word.
+            positioned_lines = []
+            for line in block.lines:
+                line_x, line_y = _first_word_origin(line.words)
+                positioned_lines.append((line_y, line_x, line))
+            positioned_lines.sort(key=lambda item: (item[0], item[1]))
+
+            # The leftmost line of the block defines zero indentation.
+            base_x = min(
+                (line_x for _, line_x, _ in positioned_lines),
+                default=unset,
+            )
+
+            block_lines = []
+            for _, line_x, line in positioned_lines:
+                # sorted() is stable, so words sharing an x keep their OCR order.
+                ordered_words = sorted(
+                    line.words,
+                    key=lambda word: _geometry_origin(word)[0],
+                )
+                if not ordered_words:
+                    continue
+
+                if base_x != unset and line_x != unset:
+                    indentation = ' ' * max(0, int((line_x - base_x) * indent_scale))
+                else:
+                    indentation = ''
+
+                block_lines.append(
+                    indentation + ' '.join(word.value for word in ordered_words)
+                )
+
+            if block_lines:
+                extracted_text_blocks.append('\n'.join(block_lines))
+
+    # Join blocks with double newlines to separate paragraphs/sections.
+    final_text = '\n\n'.join(extracted_text_blocks)
+
+    # Collapse internal runs of whitespace while keeping each line's leading
+    # indentation; whitespace-only lines become empty lines.
+    cleaned_lines = []
+    for line in final_text.split('\n'):
+        content = ' '.join(line.split())
+        if content:
+            leading_spaces = len(line) - len(line.lstrip())
+            cleaned_lines.append(' ' * leading_spaces + content)
+        else:
+            cleaned_lines.append('')
+
+    # Only blank lines are trimmed from the ends. A plain strip() here would
+    # also eat the indentation computed for the very first line.
+    return '\n'.join(cleaned_lines).strip('\n')
+
+
 def perform_ocr(image_path: str, lang: str = 'eng') -> str:
    """
    Perform OCR on the given image using DocTR.
@@ -65,122 +193,8 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str:
        # Run OCR on the document
        result = model(doc)
        
-        # Extract text while preserving formatting
-        extracted_text_blocks = []
-        
-        for page in result.pages:
-            # Sort blocks by vertical position (top to bottom)
-            blocks_with_positions = []
-            
-            for block in page.blocks:
-                # Calculate block position - we'll use the first line's first word's position
-                block_y = float('inf')
-                block_x = float('inf')
-                
-                if block.lines:
-                    first_line = block.lines[0]
-                    if first_line.words:
-                        first_word = first_line.words[0]
-                        # Get word geometry - DocTR uses relative coordinates (0-1)
-                        if hasattr(first_word, 'geometry'):
-                            # geometry is typically a polygon with corner points
-                            geometry = first_word.geometry
-                            if len(geometry) >= 2:
-                                block_x = min(point[0] for point in geometry)
-                                block_y = min(point[1] for point in geometry)
-                
-                blocks_with_positions.append((block_y, block_x, block))
-            
-            # Sort blocks by position (top to bottom, left to right)
-            blocks_with_positions.sort(key=lambda x: (x[0], x[1]))
-            
-            # Process each block
-            for _, _, block in blocks_with_positions:
-                block_lines = []
-                
-                # Sort lines within block by vertical position
-                lines_with_positions = []
-                
-                for line in block.lines:
-                    line_y = float('inf')
-                    line_x = float('inf')
-                    
-                    if line.words:
-                        first_word = line.words[0]
-                        if hasattr(first_word, 'geometry'):
-                            geometry = first_word.geometry
-                            if len(geometry) >= 2:
-                                line_x = min(point[0] for point in geometry)
-                                line_y = min(point[1] for point in geometry)
-                    
-                    lines_with_positions.append((line_y, line_x, line))
-                
-                # Sort lines by position
-                lines_with_positions.sort(key=lambda x: (x[0], x[1]))
-                
-                # Calculate base indentation from the leftmost line in the block
-                base_x = float('inf')
-                for _, line_x, _ in lines_with_positions:
-                    if line_x < base_x:
-                        base_x = line_x
-                
-                # Process each line
-                for line_y, line_x, line in lines_with_positions:
-                    # Extract words from this line
-                    line_words = []
-                    
-                    # Sort words within line by horizontal position
-                    words_with_positions = []
-                    for word in line.words:
-                        word_x = float('inf')
-                        if hasattr(word, 'geometry'):
-                            geometry = word.geometry
-                            if len(geometry) >= 2:
-                                word_x = min(point[0] for point in geometry)
-                        words_with_positions.append((word_x, word))
-                    
-                    # Sort words by horizontal position
-                    words_with_positions.sort(key=lambda x: x[0])
-                    
-                    # Extract word text
-                    for _, word in words_with_positions:
-                        line_words.append(word.value)
-                    
-                    if line_words:
-                        # Calculate relative indentation
-                        if base_x != float('inf') and line_x != float('inf'):
-                            # Convert relative position difference to approximate spaces
-                            # This is a heuristic - adjust the multiplier (50) based on your needs
-                            relative_indent = max(0, int((line_x - base_x) * 50))
-                            indentation = ' ' * relative_indent
-                        else:
-                            indentation = ''
-                        
-                        # Join words in the line with spaces
-                        line_text = indentation + ' '.join(line_words)
-                        block_lines.append(line_text)
-                
-                # Join lines in the block with newlines
-                if block_lines:
-                    block_text = '\n'.join(block_lines)
-                    extracted_text_blocks.append(block_text)
-        
-        # Join blocks with double newlines to separate paragraphs/sections
-        final_text = '\n\n'.join(extracted_text_blocks).strip()
-        
-        # Clean up excessive whitespace while preserving intentional formatting
-        lines = final_text.split('\n')
-        cleaned_lines = []
-        for line in lines:
-            # Preserve leading spaces but clean up excessive internal spacing
-            leading_spaces = len(line) - len(line.lstrip())
-            cleaned_content = ' '.join(line.split())
-            if cleaned_content:  # Only add non-empty lines
-                cleaned_lines.append(' ' * leading_spaces + cleaned_content)
-            else:
-                cleaned_lines.append('')  # Preserve empty lines
-        
-        return '\n'.join(cleaned_lines)
+        # Extract and format the text
+        return doc_result_to_formatted_text(result)
        
    except Exception as e:
        print(f"Error performing OCR: {e}")
@@ -205,4 +219,229 @@ def copy_to_clipboard(text: str) -> bool:
        return False


+def _load_annotation_font(size: int = 12):
+    """Return a TrueType font from a known system path, else Pillow's default font."""
+    candidates = (
+        "/System/Library/Fonts/Arial.ttf",
+        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
+    )
+    for candidate in candidates:
+        try:
+            return ImageFont.truetype(candidate, size)
+        except (OSError, IOError):
+            continue
+    return ImageFont.load_default()
+
+
+def _geometry_to_pixels(geometry, width, height):
+    """Scale relative geometry points to integer pixel coordinates."""
+    return [(int(point[0] * width), int(point[1] * height)) for point in geometry]
+
+
+def _bounding_rectangle(points):
+    """Return [min_x, min_y, max_x, max_y] enclosing the given pixel points."""
+    xs = [point[0] for point in points]
+    ys = [point[1] for point in points]
+    return [min(xs), min(ys), max(xs), max(ys)]
+
+
+def _words_rectangle(words, width, height):
+    """Return the pixel rectangle enclosing every word geometry, or None if there is none."""
+    points = []
+    for word in words:
+        geometry = getattr(word, 'geometry', None)
+        if geometry is not None:
+            points.extend(_geometry_to_pixels(geometry, width, height))
+    return _bounding_rectangle(points) if points else None
+
+
+def _draw_outlined_text(draw, position, text, font, fill):
+    """Draw ``text`` at ``position`` with a one-pixel black outline for legibility."""
+    x, y = position
+    for dx in (-1, 0, 1):
+        for dy in (-1, 0, 1):
+            if dx or dy:
+                draw.text((x + dx, y + dy), text, font=font, fill=(0, 0, 0))
+    draw.text((x, y), text, font=font, fill=fill)
+
+
+def annotate_image_with_ocr_results(
+    image_path: str,
+    result,
+    output_path: Optional[str] = None,
+    show_words: bool = True,
+    show_lines: bool = False,
+    show_blocks: bool = False,
+    show_text: bool = False,
+    word_color: Tuple[int, int, int, int] = (255, 0, 0, 128),  # Red with transparency
+    line_color: Tuple[int, int, int, int] = (0, 255, 0, 128),  # Green with transparency
+    block_color: Tuple[int, int, int, int] = (0, 0, 255, 128),  # Blue with transparency
+    text_color: Tuple[int, int, int] = (255, 255, 255),  # White text
+    box_width: int = 2
+) -> str:
+    """
+    Annotate an image with OCR detection results, showing bounding boxes around detected text.
+
+    Boxes are drawn on a transparent overlay that is alpha-composited onto the
+    image, so semi-transparent colors blend with the screenshot. Every page in
+    ``result`` is drawn onto the same image. Block and line boxes are the
+    axis-aligned rectangle enclosing all of their words.
+
+    Word geometries are accepted in two shapes: a two-point box
+    ((xmin, ymin), (xmax, ymax)), which the DocTR documentation describes for
+    straight pages, drawn as a rectangle; or a polygon of four or more points,
+    drawn as a polygon. Other shapes are skipped.
+
+    Args:
+        image_path: Path to the original image
+        result: DocTR OCR result object
+        output_path: Optional path to save annotated image (if None, creates one based on input)
+        show_words: Whether to show word-level bounding boxes
+        show_lines: Whether to show line-level bounding boxes
+        show_blocks: Whether to show block-level bounding boxes
+        show_text: Whether to overlay detected text on the image; text is only
+            drawn for words, so it has no effect unless show_words is True
+        word_color: RGBA color for word bounding boxes
+        line_color: RGBA color for line bounding boxes
+        block_color: RGBA color for block bounding boxes
+        text_color: RGB color for text overlay
+        box_width: Width of bounding box lines
+
+    Returns:
+        Path to the annotated image file, or an empty string if annotation
+        failed (the error is printed, not raised).
+    """
+    try:
+        # Close the source file as soon as its pixels have been copied.
+        with Image.open(image_path) as source:
+            image = source.convert('RGBA')
+        width, height = image.size
+
+        # Create a transparent overlay for drawing
+        overlay = Image.new('RGBA', image.size, (0, 0, 0, 0))
+        draw = ImageDraw.Draw(overlay)
+
+        # The font is only needed for the optional text overlay.
+        font = _load_annotation_font() if show_text else None
+
+        for page in result.pages:
+            # Coarsest level first so finer boxes are painted on top.
+            if show_blocks:
+                for block in page.blocks:
+                    block_words = [word for line in block.lines for word in line.words]
+                    rectangle = _words_rectangle(block_words, width, height)
+                    if rectangle is not None:
+                        draw.rectangle(rectangle, outline=block_color, width=box_width)
+
+            if show_lines:
+                for block in page.blocks:
+                    for line in block.lines:
+                        rectangle = _words_rectangle(line.words, width, height)
+                        if rectangle is not None:
+                            draw.rectangle(rectangle, outline=line_color, width=box_width)
+
+            if show_words:
+                for block in page.blocks:
+                    for line in block.lines:
+                        for word in line.words:
+                            geometry = getattr(word, 'geometry', None)
+                            if geometry is None:
+                                continue
+
+                            points = _geometry_to_pixels(geometry, width, height)
+                            if len(points) >= 4:
+                                draw.polygon(points, outline=word_color, width=box_width)
+                            elif len(points) == 2:
+                                # Two opposite corners: draw the box they span.
+                                draw.rectangle(
+                                    _bounding_rectangle(points),
+                                    outline=word_color,
+                                    width=box_width
+                                )
+                            else:
+                                continue
+
+                            if show_text and hasattr(word, 'value'):
+                                # Position text at the top-left of the bounding box
+                                min_x, min_y = _bounding_rectangle(points)[:2]
+                                _draw_outlined_text(
+                                    draw, (min_x, min_y), word.value, font, text_color
+                                )
+
+        # Composite the overlay onto the original image
+        annotated = Image.alpha_composite(image, overlay)
+
+        # Convert back to RGB for saving
+        annotated = annotated.convert('RGB')
+
+        # Generate output path if not provided
+        if output_path is None:
+            base_path = os.path.splitext(image_path)[0]
+            output_path = f"{base_path}_annotated.png"
+
+        # Save the annotated image
+        annotated.save(output_path)
+
+        return output_path
+
+    except Exception as e:
+        # Best-effort: annotation failures must not abort the OCR workflow.
+        print(f"Error annotating image: {e}")
+        return ""
+
+
+def perform_ocr_with_annotation(
+    image_path: str, 
+    lang: str = 'eng',
+    create_annotated: bool = False,
+    annotation_output_path: Optional[str] = None,
+    **annotation_kwargs
+) -> Tuple[str, str]:
+    """
+    Run OCR on an image and, on request, also write an annotated copy of it.
+
+    A single OCR pass feeds both the formatted text and the annotation, so the
+    image is only analysed once.
+
+    Args:
+        image_path: Path to the image file
+        lang: Language code for OCR (default: 'eng'); accepted for interface
+            compatibility, this function does not pass it to the predictor
+        create_annotated: Whether to create an annotated image
+        annotation_output_path: Optional path for annotated image
+        **annotation_kwargs: Forwarded unchanged to annotate_image_with_ocr_results
+
+    Returns:
+        Tuple of (extracted_text, annotated_image_path).
+        annotated_image_path is an empty string when create_annotated is False.
+        If anything raises, the error is printed and ("", "") is returned.
+    """
+    try:
+        # Build the predictor, then load the image and run it in one step.
+        predictor = ocr_predictor(det_arch='db_resnet50', reco_arch='parseq', pretrained=True)
+        ocr_result = predictor(DocumentFile.from_images(image_path))
+
+        text = doc_result_to_formatted_text(ocr_result)
+
+        # Plain OCR requested: no annotated image, so no path to report.
+        if not create_annotated:
+            return text, ""
+
+        return text, annotate_image_with_ocr_results(
+            image_path,
+            ocr_result,
+            annotation_output_path,
+            **annotation_kwargs
+        )
+
+    except Exception as exc:
+        print(f"Error performing OCR with annotation: {exc}")
+        return "", ""
+
+