Skip to content

Commit

Permalink
initial commit with WIP claude code in src
Browse files Browse the repository at this point in the history
  • Loading branch information
schipp0 committed Dec 5, 2024
1 parent 095fb26 commit dc25c2d
Show file tree
Hide file tree
Showing 8,220 changed files with 1,826,290 additions and 0 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
Binary file added .DS_Store
Binary file not shown.
13 changes: 13 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
max_workers: 4
batch_size: 10
timeout: 300
retry_count: 3

ocr:
  language: eng
  psm: 3
  oem: 3
  char_whitelist: abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789
  additional_params:
    preserve_interword_spaces: 1
    textord_heavy_nr: 1
Binary file added input/1000_WinningTheGeneticLottery.pdf
Binary file not shown.
Binary file added output/.DS_Store
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"document": "input/1000_WinningTheGeneticLottery.pdf",
"timestamp": "2024-12-01T06:44:09.284220",
"corrections": {}
}
Binary file added output/logs/.DS_Store
Binary file not shown.
32 changes: 32 additions & 0 deletions output/ocr_results/1000_WinningTheGeneticLottery_ocr_results.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"0": {
"status": "success",
"corrections": [],
"confidence": 3.0
},
"1": {
"status": "success",
"corrections": [],
"confidence": 0
},
"2": {
"status": "success",
"corrections": [],
"confidence": 19.5
},
"3": {
"status": "success",
"corrections": [],
"confidence": 43.8955223880597
},
"4": {
"status": "success",
"corrections": [],
"confidence": 46.30434782608695
},
"5": {
"status": "success",
"corrections": [],
"confidence": 39.833333333333336
}
}
Binary file added output/reports/.DS_Store
Binary file not shown.
9 changes: 9 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
PyMuPDF==1.23.7 # For PDF processing
numpy>=1.21.0
pandas>=1.3.0
pytesseract>=0.3.8
opencv-python>=4.5.0
Pillow>=8.0.0
pdf2image>=1.16.0
tqdm>=4.62.0
pyyaml>=5.4.0
8 changes: 8 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from setuptools import setup, find_packages

# Package metadata for the accessibility checker.
#
# package_dir maps the root package namespace to src/, so package discovery
# has to be rooted there as well. A bare find_packages() scans the project
# root instead and discovers nothing under src/, which yields an install
# with no importable packages.
setup(
    name="accessibility_checker",
    version="0.1",
    packages=find_packages(where="src"),
    package_dir={"": "src"},
)
Binary file added src/.DS_Store
Binary file not shown.
Binary file added src/__pycache__/batch.cpython-312.pyc
Binary file not shown.
Binary file added src/__pycache__/config.cpython-312.pyc
Binary file not shown.
3 changes: 3 additions & 0 deletions src/accessibility_checker.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Metadata-Version: 2.1
Name: accessibility_checker
Version: 0.1
5 changes: 5 additions & 0 deletions src/accessibility_checker.egg-info/SOURCES.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
setup.py
src/accessibility_checker.egg-info/PKG-INFO
src/accessibility_checker.egg-info/SOURCES.txt
src/accessibility_checker.egg-info/dependency_links.txt
src/accessibility_checker.egg-info/top_level.txt
1 change: 1 addition & 0 deletions src/accessibility_checker.egg-info/dependency_links.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

1 change: 1 addition & 0 deletions src/accessibility_checker.egg-info/top_level.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

142 changes: 142 additions & 0 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
from pathlib import Path
import argparse
import yaml
import logging
from datetime import datetime
import sys

# Add the src directory to Python path
sys.path.append(str(Path(__file__).parent))

from processor.batch import BatchPDFProcessor
from processor.accessibility import AccessibilityDocumentProcessor
from processor.enhance import EnhancedAccessibilityChecker
from processor.retry import RetryHandler, RetryConfig, ProcessingError, OCRError, OCRTimeoutError
from processor.ocr import OCRProcessor
from processor.config import ConfigurationManager

def setup_logging(log_dir: Path) -> None:
    """Configure root logging to a timestamped file and to the console.

    Args:
        log_dir: Directory that receives the log file. It is created
            (including missing parents) if it does not exist yet.
    """
    # FileHandler opens the file immediately, so the directory must exist.
    log_dir.mkdir(parents=True, exist_ok=True)

    # One log file per run, named after the start time.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = log_dir / f"accessibility_checker_{timestamp}.log"

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            # Explicit UTF-8 so non-ASCII file names or messages do not
            # depend on the platform's default encoding.
            logging.FileHandler(log_file, encoding="utf-8"),
            logging.StreamHandler(),
        ],
    )

def parse_arguments(argv=None):
    """Parse command line arguments.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) keeps the original
            behavior of reading ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``input_dir``, ``output_dir``, ``config``,
        ``workers`` (``None`` when not given) and ``ocr_timeout``.
    """
    parser = argparse.ArgumentParser(description='PDF Accessibility Checker')

    parser.add_argument(
        '--input-dir',
        type=str,
        required=True,
        help='Directory containing PDF files to process'
    )

    parser.add_argument(
        '--output-dir',
        type=str,
        required=True,
        help='Directory for output files and reports'
    )

    parser.add_argument(
        '--config',
        type=str,
        default='config.yaml',
        help='Path to configuration file'
    )

    parser.add_argument(
        '--workers',
        type=int,
        help='Number of worker processes (overrides config file)'
    )

    parser.add_argument(
        '--ocr-timeout',
        type=int,
        default=30,
        help='Timeout in seconds for OCR processing'
    )

    # parse_args(None) falls back to sys.argv[1:], so existing callers that
    # pass nothing are unaffected.
    return parser.parse_args(argv)

def main():
    """Run the PDF accessibility checker command-line workflow.

    Parses the command line, creates the output and log directories, builds
    the retry handler, OCR processor and batch processor from the loaded
    configuration, processes the input directory, and prints a summary.

    Raises:
        Exception: Any error raised while loading configuration or
            processing is logged with its traceback and then re-raised.
    """
    # Parse command line arguments
    args = parse_arguments()

    # Create output directory structure
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Set up logging (the log directory must exist before the file handler
    # is opened)
    log_dir = output_dir / "logs"
    log_dir.mkdir(exist_ok=True)
    setup_logging(log_dir)

    try:
        # Load configuration
        logging.info(f"Loading configuration from {args.config}")
        config_manager = ConfigurationManager(args.config)
        processing_config = config_manager.get_processing_config()

        # Override max_workers if specified in command line arguments
        # (a value of 0 is treated as "not specified").
        if args.workers:
            processing_config.max_workers = args.workers

        # Initialize processors with retry capability
        retry_handler = RetryHandler(RetryConfig(
            max_attempts=processing_config.retry_count,
            base_delay=2.0,
            max_delay=30.0,
            jitter=True
        ))

        # Initialize OCR processor with retry and timeout. The timeout comes
        # from --ocr-timeout (default 30); it was previously hard-coded, so
        # the command line option had no effect.
        ocr_processor = OCRProcessor(
            config=processing_config.ocr_settings,
            retry_handler=retry_handler,
            timeout=args.ocr_timeout
        )

        # Initialize batch processor with OCR capability
        processor = BatchPDFProcessor(
            input_dir=args.input_dir,
            output_dir=args.output_dir,
            max_workers=processing_config.max_workers,
            ocr_processor=ocr_processor,
            retry_handler=retry_handler
        )

        # Process files
        logging.info("Starting batch processing")
        summary_df = processor.process_directory()

        # Print processing summary
        success_count = sum(summary_df['status'] == 'success')
        failed_count = sum(summary_df['status'] == 'failed')
        # Default to an empty sequence: with a scalar default such as False,
        # sum() raises TypeError whenever the 'ocr_applied' column is absent.
        ocr_count = sum(summary_df.get('ocr_applied', ()))

        print("\nProcessing Summary:")
        print(f"Total files processed: {len(summary_df)}")
        print(f"Successfully processed: {success_count}")
        print(f"Failed: {failed_count}")
        print(f"OCR applied: {ocr_count}")
        print(f"\nOutput files located in: {args.output_dir}")

        logging.info("Processing completed successfully")

    except Exception as e:
        # Top-level boundary: record the full traceback, then let the error
        # propagate so the process exits with a failure.
        logging.error(f"Error in main process: {str(e)}", exc_info=True)
        raise


# Run the workflow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Binary file not shown.
Binary file not shown.
Binary file added src/processor/__pycache__/batch.cpython-311.pyc
Binary file not shown.
Binary file added src/processor/__pycache__/batch.cpython-312.pyc
Binary file not shown.
Binary file added src/processor/__pycache__/config.cpython-311.pyc
Binary file not shown.
Binary file added src/processor/__pycache__/enhance.cpython-311.pyc
Binary file not shown.
Binary file added src/processor/__pycache__/ocr.cpython-311.pyc
Binary file not shown.
Binary file added src/processor/__pycache__/retry.cpython-311.pyc
Binary file not shown.
Loading

0 comments on commit dc25c2d

Please sign in to comment.