This commit is contained in:
dingfeng.wong
2025-07-22 01:48:15 +08:00
parent c3c9664d59
commit eb47b6a22d
2 changed files with 406 additions and 121 deletions
+50 -4
View File
@@ -18,7 +18,7 @@ from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.syntax import Syntax
from .ocr_screenshot import copy_to_clipboard, perform_ocr, take_region_screenshot
from .ocr_screenshot import copy_to_clipboard, perform_ocr, take_region_screenshot, perform_ocr_with_annotation
app = typer.Typer(
name="ocr-screenshot",
@@ -46,6 +46,26 @@ def main(
verbose: bool = typer.Option(
default=False,
help="Show verbose output"
),
annotate: bool = typer.Option(
default=False,
help="Create an annotated version of the image showing detected text regions"
),
show_words: bool = typer.Option(
default=True,
help="Show word-level bounding boxes in annotation (default: True)"
),
show_lines: bool = typer.Option(
default=False,
help="Show line-level bounding boxes in annotation"
),
show_blocks: bool = typer.Option(
default=False,
help="Show block-level bounding boxes in annotation"
),
show_text: bool = typer.Option(
default=False,
help="Overlay detected text on the annotated image"
)
):
"""Take a region screenshot, perform OCR, and copy result to clipboard."""
@@ -82,7 +102,7 @@ def main(
if verbose:
console.print(f"[green]✓ Screenshot saved to: {screenshot_path}[/green]")
# Step 2: Perform OCR
# Step 2: Perform OCR (with optional annotation)
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
@@ -90,7 +110,30 @@ def main(
transient=True
) as progress:
task = progress.add_task("[bold cyan]🔍 Performing OCR...", total=None)
extracted_text = perform_ocr(str(screenshot_path), lang)
if annotate:
# Create annotation output path
annotation_path = None
if save_image:
base_name = screenshot_path.stem
annotation_path = output_dir / f"{base_name}_annotated.png"
extracted_text, annotated_image_path = perform_ocr_with_annotation(
str(screenshot_path),
lang,
create_annotated=True,
annotation_output_path=str(annotation_path) if annotation_path else None,
show_words=show_words,
show_lines=show_lines,
show_blocks=show_blocks,
show_text=show_text
)
if annotated_image_path and verbose:
console.print(f"[green]✓ Annotated image saved to: {annotated_image_path}[/green]")
else:
extracted_text = perform_ocr(str(screenshot_path), lang)
progress.update(task, description="[green]✓ OCR complete")
if not extracted_text:
@@ -117,7 +160,10 @@ def main(
raise typer.Exit(1)
# Success message
console.print("\n[bold green]✅ Text extracted and copied to clipboard![/bold green]")
success_msg = "\n[bold green]✅ Text extracted and copied to clipboard![/bold green]"
if annotate:
success_msg += "\n[bold blue]📝 Annotated image created showing detected text regions.[/bold blue]"
console.print(success_msg)
if verbose:
console.print("\n[bold]Extracted text:[/bold]")
+356 -117
View File
@@ -7,9 +7,10 @@ Core functionality for taking screenshots, performing OCR using DocTR, and clipb
import os
import subprocess
from typing import Optional, Tuple
import pyperclip
from PIL import Image
from PIL import Image, ImageDraw, ImageFont
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
@@ -44,6 +45,133 @@ def take_region_screenshot(output_path: str) -> bool:
return False
def _top_left_corner(element) -> Tuple[float, float]:
    """Return the (x, y) top-left corner of ``element.geometry``.

    Yields ``(inf, inf)`` when the element has no geometry attribute, a
    ``None`` geometry, or fewer than two points, so that such elements sort
    after everything that does have a position.
    """
    geometry = getattr(element, 'geometry', None)
    if geometry is None or len(geometry) < 2:
        return float('inf'), float('inf')
    return (
        min(point[0] for point in geometry),
        min(point[1] for point in geometry),
    )


def _anchor_of_words(words) -> Tuple[float, float]:
    """Return the top-left corner of the first word, or ``(inf, inf)`` if empty."""
    if not words:
        return float('inf'), float('inf')
    return _top_left_corner(words[0])


def _format_block_lines(block, indent_scale: float) -> list:
    """Render one block as a list of text lines with relative indentation."""
    inf = float('inf')

    # Order the lines top to bottom (then left to right), anchored on each
    # line's first listed word.
    # NOTE(review): the anchor is the first *listed* word, not necessarily
    # the leftmost one - confirm the OCR engine emits words in reading order.
    positioned_lines = []
    for line in block.lines:
        line_x, line_y = _anchor_of_words(line.words)
        positioned_lines.append((line_y, line_x, line))
    positioned_lines.sort(key=lambda item: (item[0], item[1]))

    # The leftmost line of the block is the zero-indentation reference.
    base_x = min((line_x for _, line_x, _ in positioned_lines), default=inf)

    rendered = []
    for _, line_x, line in positioned_lines:
        # Words are emitted left to right; words without geometry go last.
        ordered_words = sorted(
            line.words, key=lambda word: _top_left_corner(word)[0]
        )
        values = [word.value for word in ordered_words]
        if not values:
            continue

        if base_x != inf and line_x != inf:
            # Heuristic: scale the relative (0-1) horizontal offset into a
            # number of space characters.
            indentation = ' ' * max(0, int((line_x - base_x) * indent_scale))
        else:
            indentation = ''
        rendered.append(indentation + ' '.join(values))
    return rendered


def doc_result_to_formatted_text(result, indent_scale: float = 50) -> str:
    """
    Convert a DocTR OCR result to formatted text while preserving layout and indentation.

    Blocks are ordered top to bottom (then left to right) using the position
    of the first word of their first line; lines inside a block are ordered
    the same way, and words inside a line are ordered left to right. Each
    line is indented relative to the leftmost line of its block. Blocks are
    separated by one empty line.

    Args:
        result: OCR result object exposing ``pages`` -> ``blocks`` ->
            ``lines`` -> ``words``; each word has a ``value`` and, optionally,
            a ``geometry`` made of (x, y) points.
        indent_scale: Multiplier that converts a horizontal offset (in the
            geometry's coordinate units) into a number of spaces. The default
            of 50 matches the previous hard-coded heuristic.

    Returns:
        Formatted text string with preserved indentation and structure.
        Blank lines at the very start and end are removed, but the
        indentation of the first text line is kept.
    """
    text_blocks = []

    for page in result.pages:
        positioned_blocks = []
        for block in page.blocks:
            first_line_words = block.lines[0].words if block.lines else []
            block_x, block_y = _anchor_of_words(first_line_words)
            positioned_blocks.append((block_y, block_x, block))
        positioned_blocks.sort(key=lambda item: (item[0], item[1]))

        for _, _, block in positioned_blocks:
            block_lines = _format_block_lines(block, indent_scale)
            if block_lines:
                text_blocks.append('\n'.join(block_lines))

    # Normalise internal whitespace on every line while keeping the leading
    # indentation that was computed above.
    cleaned_lines = []
    for raw_line in '\n\n'.join(text_blocks).split('\n'):
        content = ' '.join(raw_line.split())
        if content:
            leading_spaces = len(raw_line) - len(raw_line.lstrip())
            cleaned_lines.append(' ' * leading_spaces + content)
        else:
            cleaned_lines.append('')  # Preserve empty separator lines

    # Trim blank lines at both ends. A plain str.strip() on the joined text
    # would also eat the indentation of the first line, which breaks the
    # layout when the first line is indented relative to later ones.
    start, end = 0, len(cleaned_lines)
    while start < end and not cleaned_lines[start]:
        start += 1
    while end > start and not cleaned_lines[end - 1]:
        end -= 1

    return '\n'.join(cleaned_lines[start:end])
def perform_ocr(image_path: str, lang: str = 'eng') -> str:
"""
Perform OCR on the given image using DocTR.
@@ -65,122 +193,8 @@ def perform_ocr(image_path: str, lang: str = 'eng') -> str:
# Run OCR on the document
result = model(doc)
# Extract text while preserving formatting
extracted_text_blocks = []
for page in result.pages:
# Sort blocks by vertical position (top to bottom)
blocks_with_positions = []
for block in page.blocks:
# Calculate block position - we'll use the first line's first word's position
block_y = float('inf')
block_x = float('inf')
if block.lines:
first_line = block.lines[0]
if first_line.words:
first_word = first_line.words[0]
# Get word geometry - DocTR uses relative coordinates (0-1)
if hasattr(first_word, 'geometry'):
# geometry is typically a polygon with corner points
geometry = first_word.geometry
if len(geometry) >= 2:
block_x = min(point[0] for point in geometry)
block_y = min(point[1] for point in geometry)
blocks_with_positions.append((block_y, block_x, block))
# Sort blocks by position (top to bottom, left to right)
blocks_with_positions.sort(key=lambda x: (x[0], x[1]))
# Process each block
for _, _, block in blocks_with_positions:
block_lines = []
# Sort lines within block by vertical position
lines_with_positions = []
for line in block.lines:
line_y = float('inf')
line_x = float('inf')
if line.words:
first_word = line.words[0]
if hasattr(first_word, 'geometry'):
geometry = first_word.geometry
if len(geometry) >= 2:
line_x = min(point[0] for point in geometry)
line_y = min(point[1] for point in geometry)
lines_with_positions.append((line_y, line_x, line))
# Sort lines by position
lines_with_positions.sort(key=lambda x: (x[0], x[1]))
# Calculate base indentation from the leftmost line in the block
base_x = float('inf')
for _, line_x, _ in lines_with_positions:
if line_x < base_x:
base_x = line_x
# Process each line
for line_y, line_x, line in lines_with_positions:
# Extract words from this line
line_words = []
# Sort words within line by horizontal position
words_with_positions = []
for word in line.words:
word_x = float('inf')
if hasattr(word, 'geometry'):
geometry = word.geometry
if len(geometry) >= 2:
word_x = min(point[0] for point in geometry)
words_with_positions.append((word_x, word))
# Sort words by horizontal position
words_with_positions.sort(key=lambda x: x[0])
# Extract word text
for _, word in words_with_positions:
line_words.append(word.value)
if line_words:
# Calculate relative indentation
if base_x != float('inf') and line_x != float('inf'):
# Convert relative position difference to approximate spaces
# This is a heuristic - adjust the multiplier (50) based on your needs
relative_indent = max(0, int((line_x - base_x) * 50))
indentation = ' ' * relative_indent
else:
indentation = ''
# Join words in the line with spaces
line_text = indentation + ' '.join(line_words)
block_lines.append(line_text)
# Join lines in the block with newlines
if block_lines:
block_text = '\n'.join(block_lines)
extracted_text_blocks.append(block_text)
# Join blocks with double newlines to separate paragraphs/sections
final_text = '\n\n'.join(extracted_text_blocks).strip()
# Clean up excessive whitespace while preserving intentional formatting
lines = final_text.split('\n')
cleaned_lines = []
for line in lines:
# Preserve leading spaces but clean up excessive internal spacing
leading_spaces = len(line) - len(line.lstrip())
cleaned_content = ' '.join(line.split())
if cleaned_content: # Only add non-empty lines
cleaned_lines.append(' ' * leading_spaces + cleaned_content)
else:
cleaned_lines.append('') # Preserve empty lines
return '\n'.join(cleaned_lines)
# Extract and format the text
return doc_result_to_formatted_text(result)
except Exception as e:
print(f"Error performing OCR: {e}")
@@ -205,4 +219,229 @@ def copy_to_clipboard(text: str) -> bool:
return False
def _load_annotation_font(size: int = 12):
    """Return a TrueType font for text overlays, falling back to PIL's default."""
    candidates = (
        "/System/Library/Fonts/Arial.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    )
    for candidate in candidates:
        try:
            return ImageFont.truetype(candidate, size)
        except (OSError, IOError):
            continue
    return ImageFont.load_default()


def _geometry_to_pixels(geometry, width: int, height: int) -> list:
    """Scale relative (0-1) geometry points to absolute integer pixel points."""
    return [(int(point[0] * width), int(point[1] * height)) for point in geometry]


def _words_pixel_bounds(words, width: int, height: int):
    """Return [min_x, min_y, max_x, max_y] enclosing *words*, or None if empty."""
    points = []
    for word in words:
        if hasattr(word, 'geometry'):
            points.extend(_geometry_to_pixels(word.geometry, width, height))
    if not points:
        return None
    xs = [point[0] for point in points]
    ys = [point[1] for point in points]
    return [min(xs), min(ys), max(xs), max(ys)]


def annotate_image_with_ocr_results(
    image_path: str,
    result,
    output_path: Optional[str] = None,
    show_words: bool = True,
    show_lines: bool = False,
    show_blocks: bool = False,
    show_text: bool = False,
    word_color: Tuple[int, int, int, int] = (255, 0, 0, 128),  # Red with transparency
    line_color: Tuple[int, int, int, int] = (0, 255, 0, 128),  # Green with transparency
    block_color: Tuple[int, int, int, int] = (0, 0, 255, 128),  # Blue with transparency
    text_color: Tuple[int, int, int] = (255, 255, 255),  # White text
    box_width: int = 2
) -> str:
    """
    Annotate an image with OCR detection results, showing bounding boxes around detected text.

    Word geometries may be either two corner points (an axis-aligned box,
    drawn as a rectangle) or four or more points (drawn as a polygon).
    Line and block boxes are the axis-aligned bounds of all contained words.

    Args:
        image_path: Path to the original image
        result: OCR result object exposing ``pages`` -> ``blocks`` ->
            ``lines`` -> ``words``, with word geometry in relative coordinates
        output_path: Optional path to save annotated image (if None, creates
            ``<image_path without extension>_annotated.png``)
        show_words: Whether to show word-level bounding boxes
        show_lines: Whether to show line-level bounding boxes
        show_blocks: Whether to show block-level bounding boxes
        show_text: Whether to overlay detected text on the word boxes
            (only has an effect together with ``show_words``)
        word_color: RGBA color for word bounding boxes
        line_color: RGBA color for line bounding boxes
        block_color: RGBA color for block bounding boxes
        text_color: RGB color for text overlay
        box_width: Width of bounding box lines

    Returns:
        Path to the annotated image file, or an empty string if anything
        failed (the error is printed).
    """
    try:
        # Load the original image; the context manager releases the file
        # handle as soon as the pixel data has been converted.
        with Image.open(image_path) as source:
            image = source.convert('RGBA')
        width, height = image.size

        # Draw on a transparent overlay so semi-transparent colors blend
        # with the original when composited.
        overlay = Image.new('RGBA', image.size, (0, 0, 0, 0))
        draw = ImageDraw.Draw(overlay)

        font = _load_annotation_font()

        # The 8 neighbouring offsets used to paint a black outline behind text.
        outline_offsets = [
            (dx, dy)
            for dx in (-1, 0, 1)
            for dy in (-1, 0, 1)
            if dx != 0 or dy != 0
        ]

        for page in result.pages:
            # Blocks: one box around every word of the block.
            if show_blocks:
                for block in page.blocks:
                    block_words = [
                        word for line in block.lines for word in line.words
                    ]
                    bounds = _words_pixel_bounds(block_words, width, height)
                    if bounds is not None:
                        draw.rectangle(bounds, outline=block_color, width=box_width)

            # Lines: one box around every word of the line.
            if show_lines:
                for block in page.blocks:
                    for line in block.lines:
                        bounds = _words_pixel_bounds(line.words, width, height)
                        if bounds is not None:
                            draw.rectangle(bounds, outline=line_color, width=box_width)

            # Words (most detailed level).
            if show_words:
                for block in page.blocks:
                    for line in block.lines:
                        for word in line.words:
                            if not hasattr(word, 'geometry'):
                                continue
                            abs_points = _geometry_to_pixels(
                                word.geometry, width, height
                            )
                            if len(abs_points) >= 4:
                                # Polygon geometry (e.g. rotated boxes).
                                draw.polygon(
                                    abs_points, outline=word_color, width=box_width
                                )
                            elif len(abs_points) == 2:
                                # Two-corner geometry: draw it as a rectangle.
                                # Normalising through min/max keeps the corner
                                # order valid for ImageDraw.rectangle.
                                draw.rectangle(
                                    _words_pixel_bounds([word], width, height),
                                    outline=word_color,
                                    width=box_width,
                                )
                            else:
                                continue

                            if show_text and hasattr(word, 'value'):
                                # Position text at the top-left of the box.
                                min_x = min(p[0] for p in abs_points)
                                min_y = min(p[1] for p in abs_points)

                                # Black outline first, for better visibility.
                                for dx, dy in outline_offsets:
                                    draw.text(
                                        (min_x + dx, min_y + dy),
                                        word.value,
                                        font=font,
                                        fill=(0, 0, 0)  # Black outline
                                    )

                                # Then the main text on top.
                                draw.text(
                                    (min_x, min_y),
                                    word.value,
                                    font=font,
                                    fill=text_color
                                )

        # Composite the overlay onto the original image and drop the alpha
        # channel for saving.
        annotated = Image.alpha_composite(image, overlay).convert('RGB')

        # Generate output path if not provided
        if output_path is None:
            base_path = os.path.splitext(image_path)[0]
            output_path = f"{base_path}_annotated.png"

        annotated.save(output_path)
        return output_path

    except Exception as e:
        # Best-effort: annotation failures must not abort the OCR flow;
        # callers treat an empty path as "no annotated image".
        print(f"Error annotating image: {e}")
        return ""
def perform_ocr_with_annotation(
    image_path: str,
    lang: str = 'eng',
    create_annotated: bool = False,
    annotation_output_path: Optional[str] = None,
    **annotation_kwargs
) -> Tuple[str, str]:
    """
    Run OCR on an image and, on request, also produce an annotated copy.

    The OCR model is run once and its result is used for both the text
    extraction and the annotation.

    Args:
        image_path: Path to the image file
        lang: Language code for OCR (default: 'eng'); accepted for interface
            compatibility, not referenced by this function
        create_annotated: Whether to create an annotated image
        annotation_output_path: Optional path for annotated image
        **annotation_kwargs: Forwarded unchanged to
            ``annotate_image_with_ocr_results``

    Returns:
        Tuple of (extracted_text, annotated_image_path).
        annotated_image_path is an empty string if create_annotated is False.
        On any error the error is printed and ("", "") is returned.
    """
    try:
        # Build the predictor, then feed it the document loaded from disk.
        predictor = ocr_predictor(det_arch='db_resnet50', reco_arch='parseq', pretrained=True)
        ocr_result = predictor(DocumentFile.from_images(image_path))

        text = doc_result_to_formatted_text(ocr_result)

        # No annotation requested: report an empty path.
        if not create_annotated:
            return text, ""

        return text, annotate_image_with_ocr_results(
            image_path,
            ocr_result,
            annotation_output_path,
            **annotation_kwargs
        )

    except Exception as e:
        print(f"Error performing OCR with annotation: {e}")
        return "", ""