From eb47b6a22d610adf41b98f899fb7c8e84ebf7c4b Mon Sep 17 00:00:00 2001 From: "dingfeng.wong" Date: Tue, 22 Jul 2025 01:48:15 +0800 Subject: [PATCH] a --- src/tooling/cli.py | 54 +++- src/tooling/ocr_screenshot.py | 473 +++++++++++++++++++++++++--------- 2 files changed, 406 insertions(+), 121 deletions(-) diff --git a/src/tooling/cli.py b/src/tooling/cli.py index 7ba1c8d..832e438 100644 --- a/src/tooling/cli.py +++ b/src/tooling/cli.py @@ -18,7 +18,7 @@ from rich.panel import Panel from rich.progress import Progress, SpinnerColumn, TextColumn from rich.syntax import Syntax -from .ocr_screenshot import copy_to_clipboard, perform_ocr, take_region_screenshot +from .ocr_screenshot import copy_to_clipboard, perform_ocr, take_region_screenshot, perform_ocr_with_annotation app = typer.Typer( name="ocr-screenshot", @@ -46,6 +46,26 @@ def main( verbose: bool = typer.Option( default=False, help="Show verbose output" + ), + annotate: bool = typer.Option( + default=False, + help="Create an annotated version of the image showing detected text regions" + ), + show_words: bool = typer.Option( + default=True, + help="Show word-level bounding boxes in annotation (default: True)" + ), + show_lines: bool = typer.Option( + default=False, + help="Show line-level bounding boxes in annotation" + ), + show_blocks: bool = typer.Option( + default=False, + help="Show block-level bounding boxes in annotation" + ), + show_text: bool = typer.Option( + default=False, + help="Overlay detected text on the annotated image" ) ): """Take a region screenshot, perform OCR, and copy result to clipboard.""" @@ -82,7 +102,7 @@ def main( if verbose: console.print(f"[green]✓ Screenshot saved to: {screenshot_path}[/green]") - # Step 2: Perform OCR + # Step 2: Perform OCR (with optional annotation) with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), @@ -90,7 +110,30 @@ def main( transient=True ) as progress: task = progress.add_task("[bold cyan]🔍 Performing OCR...", total=None) - extracted_text = perform_ocr(str(screenshot_path), lang) + + if annotate: + # Create annotation output path + annotation_path = None + if save_image: + base_name = screenshot_path.stem + annotation_path = output_dir / f"{base_name}_annotated.png" + + extracted_text, annotated_image_path = perform_ocr_with_annotation( + str(screenshot_path), + lang, + create_annotated=True, + annotation_output_path=str(annotation_path) if annotation_path else None, + show_words=show_words, + show_lines=show_lines, + show_blocks=show_blocks, + show_text=show_text + ) + + if annotated_image_path and verbose: + console.print(f"[green]✓ Annotated image saved to: {annotated_image_path}[/green]") + else: + extracted_text = perform_ocr(str(screenshot_path), lang) + progress.update(task, description="[green]✓ OCR complete") if not extracted_text: @@ -117,7 +160,10 @@ def main( raise typer.Exit(1) # Success message - console.print("\n[bold green]✅ Text extracted and copied to clipboard![/bold green]") + success_msg = "\n[bold green]✅ Text extracted and copied to clipboard![/bold green]" + if annotate: + success_msg += "\n[bold blue]📝 Annotated image created showing detected text regions.[/bold blue]" + console.print(success_msg) if verbose: console.print("\n[bold]Extracted text:[/bold]") diff --git a/src/tooling/ocr_screenshot.py b/src/tooling/ocr_screenshot.py index 76c82fd..8cd3de4 100644 --- a/src/tooling/ocr_screenshot.py +++ b/src/tooling/ocr_screenshot.py @@ -7,9 +7,10 @@ Core functionality for taking screenshots, performing OCR using DocTR, and clipb import os import subprocess +from typing import Optional, Tuple import pyperclip -from PIL import Image +from PIL import Image, ImageDraw, ImageFont from doctr.io import DocumentFile from doctr.models import ocr_predictor @@ -44,6 +45,133 @@ def take_region_screenshot(output_path: str) -> bool: return False +def doc_result_to_formatted_text(result) -> str: + """ + Convert a DocTR OCR result to formatted text while preserving layout and indentation. + + Args: + result: DocTR OCR result object containing pages with detected text + + Returns: + Formatted text string with preserved indentation and structure + """ + extracted_text_blocks = [] + + for page in result.pages: + # Sort blocks by vertical position (top to bottom) + blocks_with_positions = [] + + for block in page.blocks: + # Calculate block position - we'll use the first line's first word's position + block_y = float('inf') + block_x = float('inf') + + if block.lines: + first_line = block.lines[0] + if first_line.words: + first_word = first_line.words[0] + # Get word geometry - DocTR uses relative coordinates (0-1) + if hasattr(first_word, 'geometry'): + # geometry is typically a polygon with corner points + geometry = first_word.geometry + if len(geometry) >= 2: + block_x = min(point[0] for point in geometry) + block_y = min(point[1] for point in geometry) + + blocks_with_positions.append((block_y, block_x, block)) + + # Sort blocks by position (top to bottom, left to right) + blocks_with_positions.sort(key=lambda x: (x[0], x[1])) + + # Process each block + for _, _, block in blocks_with_positions: + block_lines = [] + + # Sort lines within block by vertical position + lines_with_positions = [] + + for line in block.lines: + line_y = float('inf') + line_x = float('inf') + + if line.words: + first_word = line.words[0] + if hasattr(first_word, 'geometry'): + geometry = first_word.geometry + if len(geometry) >= 2: + line_x = min(point[0] for point in geometry) + line_y = min(point[1] for point in geometry) + + lines_with_positions.append((line_y, line_x, line)) + + # Sort lines by position + lines_with_positions.sort(key=lambda x: (x[0], x[1])) + + # Calculate base indentation from the leftmost line in the block + base_x = float('inf') + for _, line_x, _ in lines_with_positions: + if line_x < base_x: + base_x = line_x + + # Process each line + for line_y, line_x, line in lines_with_positions: + # Extract words from this line + line_words = [] + + # Sort words within line by horizontal position + words_with_positions = [] + for word in line.words: + word_x = float('inf') + if hasattr(word, 'geometry'): + geometry = word.geometry + if len(geometry) >= 2: + word_x = min(point[0] for point in geometry) + words_with_positions.append((word_x, word)) + + # Sort words by horizontal position + words_with_positions.sort(key=lambda x: x[0]) + + # Extract word text + for _, word in words_with_positions: + line_words.append(word.value) + + if line_words: + # Calculate relative indentation + if base_x != float('inf') and line_x != float('inf'): + # Convert relative position difference to approximate spaces + # This is a heuristic - adjust the multiplier (50) based on your needs + relative_indent = max(0, int((line_x - base_x) * 50)) + indentation = ' ' * relative_indent + else: + indentation = '' + + # Join words in the line with spaces + line_text = indentation + ' '.join(line_words) + block_lines.append(line_text) + + # Join lines in the block with newlines + if block_lines: + block_text = '\n'.join(block_lines) + extracted_text_blocks.append(block_text) + + # Join blocks with double newlines to separate paragraphs/sections + final_text = '\n\n'.join(extracted_text_blocks).strip() + + # Clean up excessive whitespace while preserving intentional formatting + lines = final_text.split('\n') + cleaned_lines = [] + for line in lines: + # Preserve leading spaces but clean up excessive internal spacing + leading_spaces = len(line) - len(line.lstrip()) + cleaned_content = ' '.join(line.split()) + if cleaned_content: # Only add non-empty lines + cleaned_lines.append(' ' * leading_spaces + cleaned_content) + else: + cleaned_lines.append('') # Preserve empty lines + + return '\n'.join(cleaned_lines) + + def perform_ocr(image_path: str, lang: str = 'eng') -> str: """ Perform OCR on the given image using DocTR. @@ -65,122 +193,8 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str: # Run OCR on the document result = model(doc) - # Extract text while preserving formatting - extracted_text_blocks = [] - - for page in result.pages: - # Sort blocks by vertical position (top to bottom) - blocks_with_positions = [] - - for block in page.blocks: - # Calculate block position - we'll use the first line's first word's position - block_y = float('inf') - block_x = float('inf') - - if block.lines: - first_line = block.lines[0] - if first_line.words: - first_word = first_line.words[0] - # Get word geometry - DocTR uses relative coordinates (0-1) - if hasattr(first_word, 'geometry'): - # geometry is typically a polygon with corner points - geometry = first_word.geometry - if len(geometry) >= 2: - block_x = min(point[0] for point in geometry) - block_y = min(point[1] for point in geometry) - - blocks_with_positions.append((block_y, block_x, block)) - - # Sort blocks by position (top to bottom, left to right) - blocks_with_positions.sort(key=lambda x: (x[0], x[1])) - - # Process each block - for _, _, block in blocks_with_positions: - block_lines = [] - - # Sort lines within block by vertical position - lines_with_positions = [] - - for line in block.lines: - line_y = float('inf') - line_x = float('inf') - - if line.words: - first_word = line.words[0] - if hasattr(first_word, 'geometry'): - geometry = first_word.geometry - if len(geometry) >= 2: - line_x = min(point[0] for point in geometry) - line_y = min(point[1] for point in geometry) - - lines_with_positions.append((line_y, line_x, line)) - - # Sort lines by position - lines_with_positions.sort(key=lambda x: (x[0], x[1])) - - # Calculate base indentation from the leftmost line in the block - base_x = float('inf') - for _, line_x, _ in lines_with_positions: - if line_x < base_x: - base_x = line_x - - # Process each line - for line_y, line_x, line in lines_with_positions: - # Extract words from this line - line_words = [] - - # Sort words within line by horizontal position - words_with_positions = [] - for word in line.words: - word_x = float('inf') - if hasattr(word, 'geometry'): - geometry = word.geometry - if len(geometry) >= 2: - word_x = min(point[0] for point in geometry) - words_with_positions.append((word_x, word)) - - # Sort words by horizontal position - words_with_positions.sort(key=lambda x: x[0]) - - # Extract word text - for _, word in words_with_positions: - line_words.append(word.value) - - if line_words: - # Calculate relative indentation - if base_x != float('inf') and line_x != float('inf'): - # Convert relative position difference to approximate spaces - # This is a heuristic - adjust the multiplier (50) based on your needs - relative_indent = max(0, int((line_x - base_x) * 50)) - indentation = ' ' * relative_indent - else: - indentation = '' - - # Join words in the line with spaces - line_text = indentation + ' '.join(line_words) - block_lines.append(line_text) - - # Join lines in the block with newlines - if block_lines: - block_text = '\n'.join(block_lines) - extracted_text_blocks.append(block_text) - - # Join blocks with double newlines to separate paragraphs/sections - final_text = '\n\n'.join(extracted_text_blocks).strip() - - # Clean up excessive whitespace while preserving intentional formatting - lines = final_text.split('\n') - cleaned_lines = [] - for line in lines: - # Preserve leading spaces but clean up excessive internal spacing - leading_spaces = len(line) - len(line.lstrip()) - cleaned_content = ' '.join(line.split()) - if cleaned_content: # Only add non-empty lines - cleaned_lines.append(' ' * leading_spaces + cleaned_content) - else: - cleaned_lines.append('') # Preserve empty lines - - return '\n'.join(cleaned_lines) + # Extract and format the text + return doc_result_to_formatted_text(result) except Exception as e: print(f"Error performing OCR: {e}") @@ -205,4 +219,229 @@ def copy_to_clipboard(text: str) -> bool: return False +def annotate_image_with_ocr_results( + image_path: str, + result, + output_path: Optional[str] = None, + show_words: bool = True, + show_lines: bool = False, + show_blocks: bool = False, + show_text: bool = False, + word_color: Tuple[int, int, int, int] = (255, 0, 0, 128), # Red with transparency + line_color: Tuple[int, int, int, int] = (0, 255, 0, 128), # Green with transparency + block_color: Tuple[int, int, int, int] = (0, 0, 255, 128), # Blue with transparency + text_color: Tuple[int, int, int] = (255, 255, 255), # White text + box_width: int = 2 +) -> str: + """ + Annotate an image with OCR detection results, showing bounding boxes around detected text. + + Args: + image_path: Path to the original image + result: DocTR OCR result object + output_path: Optional path to save annotated image (if None, creates one based on input) + show_words: Whether to show word-level bounding boxes + show_lines: Whether to show line-level bounding boxes + show_blocks: Whether to show block-level bounding boxes + show_text: Whether to overlay detected text on the image + word_color: RGBA color for word bounding boxes + line_color: RGBA color for line bounding boxes + block_color: RGBA color for block bounding boxes + text_color: RGB color for text overlay + box_width: Width of bounding box lines + + Returns: + Path to the annotated image file + """ + try: + # Load the original image + image = Image.open(image_path).convert('RGBA') + width, height = image.size + + # Create a transparent overlay for drawing + overlay = Image.new('RGBA', image.size, (0, 0, 0, 0)) + draw = ImageDraw.Draw(overlay) + + # Try to load a font for text overlay + try: + font = ImageFont.truetype("/System/Library/Fonts/Arial.ttf", 12) + except (OSError, IOError): + try: + font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 12) + except (OSError, IOError): + font = ImageFont.load_default() + + # Process each page + for page in result.pages: + # Draw blocks if requested + if show_blocks: + for block in page.blocks: + if block.lines and block.lines[0].words: + # Calculate block bounding box from all words in the block + all_points = [] + for line in block.lines: + for word in line.words: + if hasattr(word, 'geometry'): + geometry = word.geometry + for point in geometry: + # Convert relative coordinates to absolute + abs_x = int(point[0] * width) + abs_y = int(point[1] * height) + all_points.append((abs_x, abs_y)) + + if all_points: + min_x = min(p[0] for p in all_points) + max_x = max(p[0] for p in all_points) + min_y = min(p[1] for p in all_points) + max_y = max(p[1] for p in all_points) + + draw.rectangle( + [min_x, min_y, max_x, max_y], + outline=block_color, + width=box_width + ) + + # Draw lines if requested + if show_lines: + for block in page.blocks: + for line in block.lines: + if line.words: + # Calculate line bounding box from all words in the line + all_points = [] + for word in line.words: + if hasattr(word, 'geometry'): + geometry = word.geometry + for point in geometry: + # Convert relative coordinates to absolute + abs_x = int(point[0] * width) + abs_y = int(point[1] * height) + all_points.append((abs_x, abs_y)) + + if all_points: + min_x = min(p[0] for p in all_points) + max_x = max(p[0] for p in all_points) + min_y = min(p[1] for p in all_points) + max_y = max(p[1] for p in all_points) + + draw.rectangle( + [min_x, min_y, max_x, max_y], + outline=line_color, + width=box_width + ) + + # Draw words (most detailed level) + if show_words: + for block in page.blocks: + for line in block.lines: + for word in line.words: + if hasattr(word, 'geometry'): + geometry = word.geometry + if len(geometry) >= 4: # Should be a polygon with at least 4 points + # Convert relative coordinates to absolute + abs_points = [] + for point in geometry: + abs_x = int(point[0] * width) + abs_y = int(point[1] * height) + abs_points.append((abs_x, abs_y)) + + # Draw the polygon outline + draw.polygon(abs_points, outline=word_color, width=box_width) + + # Optionally overlay the detected text + if show_text and hasattr(word, 'value'): + # Position text at the top-left of the bounding box + min_x = min(p[0] for p in abs_points) + min_y = min(p[1] for p in abs_points) + + # Draw text with black outline for better visibility + for dx in [-1, 0, 1]: + for dy in [-1, 0, 1]: + if dx != 0 or dy != 0: + draw.text( + (min_x + dx, min_y + dy), + word.value, + font=font, + fill=(0, 0, 0) # Black outline + ) + + # Draw the main text + draw.text( + (min_x, min_y), + word.value, + font=font, + fill=text_color + ) + + # Composite the overlay onto the original image + annotated = Image.alpha_composite(image, overlay) + + # Convert back to RGB for saving + annotated = annotated.convert('RGB') + + # Generate output path if not provided + if output_path is None: + base_path = os.path.splitext(image_path)[0] + output_path = f"{base_path}_annotated.png" + + # Save the annotated image + annotated.save(output_path) + + return output_path + + except Exception as e: + print(f"Error annotating image: {e}") + return "" + + +def perform_ocr_with_annotation( + image_path: str, + lang: str = 'eng', + create_annotated: bool = False, + annotation_output_path: Optional[str] = None, + **annotation_kwargs +) -> Tuple[str, str]: + """ + Perform OCR and optionally create an annotated version of the image. + + Args: + image_path: Path to the image file + lang: Language code for OCR (default: 'eng') + create_annotated: Whether to create an annotated image + annotation_output_path: Optional path for annotated image + **annotation_kwargs: Additional arguments for annotation function + + Returns: + Tuple of (extracted_text, annotated_image_path) + annotated_image_path will be empty string if create_annotated is False + """ + try: + # Load the OCR model + model = ocr_predictor(det_arch='db_resnet50', reco_arch='parseq', pretrained=True) + + # Load the document from the image file + doc = DocumentFile.from_images(image_path) + + # Run OCR on the document + result = model(doc) + + # Extract and format the text + extracted_text = doc_result_to_formatted_text(result) + + # Create annotated image if requested + annotated_path = "" + if create_annotated: + annotated_path = annotate_image_with_ocr_results( + image_path, + result, + annotation_output_path, + **annotation_kwargs + ) + + return extracted_text, annotated_path + + except Exception as e: + print(f"Error performing OCR with annotation: {e}") + return "", "" + + \ No newline at end of file