From 243a8f115c906049c8ee623e7416f6fe9f6a3aab Mon Sep 17 00:00:00 2001 From: schipp0 Date: Fri, 3 Oct 2025 16:37:56 +0000 Subject: [PATCH] version 1.0 complete and ready for HathiTrust verification --- .gitignore | 6 +- .memory-bank/activeContext.md | 96 +++ .memory-bank/progress.md | 363 +++++++++ README.md | 277 ++++--- docs/README.md | 206 +++++ docs/TEST_SUMMARY.md | 101 +++ lib64 | 1 + src/__init__.py | 21 + .../checksum_generator.py | 14 + .../collect_metadata.py | 0 file_validator.py => src/file_validator.py | 0 src/main_pipeline.py | 724 ++++++++++++++++++ ocr_processor.py => src/ocr_processor.py | 2 +- src/package_assembler.py | 387 ++++++++++ src/package_validator.py | 584 ++++++++++++++ .../volume_discovery.py | 3 +- src/yaml_generator.py | 266 +++++++ src/zip_packager.py | 485 ++++++++++++ tests/__init__.py | 6 + .../test_checksum_generator.py | 2 +- .../test_file_validator.py | 2 +- tests/test_main_pipeline.py | 243 ++++++ .../test_ocr_processor.py | 2 +- tests/test_package_assembler.py | 270 +++++++ tests/test_package_validator.py | 376 +++++++++ .../test_volume_discovery.py | 2 +- tests/test_yaml_generator.py | 180 +++++ tests/test_zip_packager.py | 261 +++++++ 28 files changed, 4733 insertions(+), 147 deletions(-) create mode 100644 .memory-bank/activeContext.md create mode 100644 .memory-bank/progress.md create mode 100644 docs/README.md create mode 100644 docs/TEST_SUMMARY.md create mode 120000 lib64 create mode 100644 src/__init__.py rename checksum_generator.py => src/checksum_generator.py (93%) rename collect_metadata.py => src/collect_metadata.py (100%) rename file_validator.py => src/file_validator.py (100%) create mode 100644 src/main_pipeline.py rename ocr_processor.py => src/ocr_processor.py (99%) create mode 100644 src/package_assembler.py create mode 100644 src/package_validator.py rename volume_discovery.py => src/volume_discovery.py (98%) create mode 100755 src/yaml_generator.py create mode 100644 src/zip_packager.py create mode 100644 tests/__init__.py rename test_checksum_generator.py => tests/test_checksum_generator.py (99%) rename test_file_validator.py => tests/test_file_validator.py (98%) create mode 100644 tests/test_main_pipeline.py rename test_ocr_processor.py => tests/test_ocr_processor.py (98%) create mode 100644 tests/test_package_assembler.py create mode 100644 tests/test_package_validator.py rename test_volume_discovery.py => tests/test_volume_discovery.py (99%) create mode 100644 tests/test_yaml_generator.py create mode 100644 tests/test_zip_packager.py diff --git a/.gitignore b/.gitignore index 6287725..fdb0349 100644 --- a/.gitignore +++ b/.gitignore @@ -66,6 +66,7 @@ metadata_*.json *.swo *~ .DS_Store +*.code-workspace # OS-specific Thumbs.db @@ -85,8 +86,9 @@ dmypy.json # Pyre type checker .pyre/ -# Memory bank (optional - uncomment if you don't want to track memory) -# .memory-bank/ +# Memory bank and Claude-specific files +.memory-bank/ +.clauderules # External dependencies (clone separately) HathiTrustYAMLgenerator/ diff --git a/.memory-bank/activeContext.md b/.memory-bank/activeContext.md new file mode 100644 index 0000000..35091b3 --- /dev/null +++ b/.memory-bank/activeContext.md @@ -0,0 +1,96 @@ +# Active Context: Current Processing Focus + +## Current Phase +**Development Phase**: Building core pipeline modules (Steps 1-10) + +## Implementation Progress + +### āœ… Completed Steps (1-10) - PIPELINE COMPLETE +- **Step 1: Configuration & Setup** - Project structure, config.yaml, requirements +- **Step 2: Volume Discovery** - 
`volume_discovery.py` (7 tests passing) + - Supports barcode and ARK identifiers + - Validates sequential numbering + - Groups TIFFs by volume +- **Step 3: OCR Processing** - `ocr_processor.py` (tests passing) + - Plain text OCR with pytesseract + - hOCR coordinate data generation + - UTF-8 encoding and control character sanitization +- **Step 4: File Validation** - `file_validator.py` (8 tests passing) + - 8-digit sequential naming enforcement + - Triplet verification (TIFF/TXT/HTML) + - Dry-run mode for safe testing +- **Step 5: YAML Generation** - `yaml_generator.py` (5 tests passing) + - Reads per-package metadata JSON + - HathiTrust-compliant YAML structure + - Auto-labels FRONT_COVER and BACK_COVER +- **Step 6: MD5 Checksum Generation** - `checksum_generator.py` (14 tests passing) + - MD5 computation for all package files + - Checksum.md5 file generation (excludes self) + - Verification and validation capabilities +- **Step 7: Package Assembly** - `package_assembler.py` (11 tests passing) + - Flat directory structure organization + - File copying to package directory + - Triplet validation (TIFF/TXT/HTML matching) + - Sequential numbering verification + - Checksum generation integration + - Comprehensive package validation +- **Step 8: ZIP Archive Creation** - `zip_packager.py` (15 tests passing) + - Creates HathiTrust-compliant flat-structure ZIPs + - ZIP_DEFLATED compression + - Structure validation (detects subdirectories) + - Integrity verification with testzip() + - macOS metadata filtering (._files, .DS_Store) + - Content listing and extraction capabilities + - CLI interface for all operations +- **Step 9: Quality Control & Validation** - `package_validator.py` (15 tests passing) + - Comprehensive HathiTrust compliance checking + - Naming convention validation (barcode/ARK) + - ZIP structure verification (flat, no subdirectories) + - Required files validation (meta.yml, checksum.md5) + - File triplet verification (TIFF/TXT/HTML matching) + - Sequential numbering validation (no gaps) + - YAML metadata validation (structure and fields) + - MD5 checksum verification (all files) + - Detailed validation reports with categorized checks + - CLI with verbose and JSON output modes + +### šŸ”„ In Progress +**None currently** - Ready for Step 10 implementation + +### šŸ“‹ Remaining Steps (10) +- **Step 10: Main Pipeline Orchestration** + - Create `main_pipeline.py` + - Integrate all modules (Steps 1-9) + - Batch processing with error recovery + - Processing report generation + +## Recent Processing Activity +**No volumes processed yet** - Pipeline still in development phase + +## Next Immediate Steps +1. Implement Step 10: Main Pipeline Orchestration +2. Create comprehensive integration test suite +3. Document in DEMO_step10.md +4. Commit Steps 8 & 9 to GitHub +5. 
Test end-to-end pipeline with real volumes + +## Current Testing Focus +- āœ… All unit tests verified with pytest (77 passing, 1 skipped) +- Steps 1-9 fully tested (78 tests total: 7+3+8+5+14+11+15+15) +- Test execution time: ~0.50 seconds +- Test file generators available for development +- Integration testing planned after Step 10 completion + +## Known Issues/Decisions +- **Metadata collection**: Using interactive JSON approach instead of static config +- **YAML generator**: Using custom implementation instead of external HathiTrustYAMLgenerator repo +- **Source system**: CaptureOne Cultural Heritage Edition (not physical scanner) +- **Variable settings**: Per-package metadata collection supports different DPI/compression per volume +- **DEMO files**: Removed from public repo, added to .gitignore for privacy + +## Git Repository Status +- **Branch**: master (tracking origin/master) +- **Last commit**: [Pending] Step 8: ZIP Archive Creation +- **Remote**: https://github.itap.purdue.edu/schipp0/hathitrust-package-automation +- **Total commits**: 4 (5 after Step 8 commit) +- **Files tracked**: 25+ Python modules, tests, documentation diff --git a/.memory-bank/progress.md b/.memory-bank/progress.md new file mode 100644 index 0000000..04e642d --- /dev/null +++ b/.memory-bank/progress.md @@ -0,0 +1,363 @@ +# Progress: Implementation Status + +## Pipeline Implementation Status + +### Completed Modules āœ… + +#### Step 1: Configuration & Setup (100%) +- āœ… Project directory structure created +- āœ… config.yaml with static settings +- āœ… requirements.txt with dependencies +- āœ… metadata_template.json for volume metadata +- āœ… collect_metadata.py interactive script +- āœ… Git repository initialized and connected to remote + +**Deliverables**: +- Functional project structure +- Configuration management system +- Metadata collection workflow + +--- + +#### Step 2: Volume Discovery (100%) +**Module**: `volume_discovery.py` +- āœ… VolumeGroup class for organizing files by identifier +- āœ… Barcode and ARK identifier support +- āœ… Sequential numbering validation (no gaps) +- āœ… Pattern matching: `_00000001.tif` format +- āœ… Test suite: 7 tests passing +- āœ… Test data generator: `--create-test` flag +- āœ… CLI interface for standalone usage + +**Functions**: +- `discover_volumes(input_dir)`: Main discovery function +- `extract_barcode_or_ark(filename)`: Identifier extraction +- `extract_sequence_number(filename)`: 8-digit sequence parsing + +--- + +#### Step 3: OCR Processing (100%) +**Module**: `ocr_processor.py` +- āœ… OCRProcessor class with configurable language/PSM +- āœ… Plain text OCR via `image_to_string()` +- āœ… Coordinate OCR (hOCR) via `image_to_pdf_or_hocr()` +- āœ… UTF-8 encoding enforcement +- āœ… Control character sanitization (keep tab, CR, LF) +- āœ… Error handling with continuation on failures +- āœ… OCRResult dataclass for structured results +- āœ… Test suite with error scenarios +- āœ… CLI with `--language`, `--output-dir`, `--volume-id` + +**Functions**: +- `process_single_file(tiff_file)`: Single image OCR +- `process_volume(volume_id, tiff_files)`: Batch OCR +- `remove_control_chars(text)`: Sanitization + +--- + +#### Step 4: File Validation & Naming (100%) +**Module**: `file_validator.py` +- āœ… FileValidator class for naming enforcement +- āœ… 8-digit sequential format validation +- āœ… Triplet verification (TIFF/TXT/HTML matching) +- āœ… Dry-run mode for safe testing +- āœ… FileValidationResult dataclass +- āœ… Case-insensitive extension handling +- āœ… Test suite: 8 
tests passing +- āœ… CLI with `--extension`, `--dry-run`, `--verify-only` + +**Functions**: +- `format_sequence_number(num)`: 8-digit zero-padding +- `validate_single_file(file_path)`: Single file check +- `validate_file_list(files)`: Batch validation +- `verify_sequential_naming(files)`: Gap detection +- `verify_matching_triplets(tiffs, txts, htmls)`: Triplet check + +--- + +#### Step 5: YAML Metadata Generation (100%) +**Module**: `yaml_generator.py` +- āœ… YAMLGenerator class for meta.yml creation +- āœ… Reads metadata from JSON files +- āœ… Auto-detects page count from TIFF directory +- āœ… HathiTrust-compliant YAML structure +- āœ… Auto-labels FRONT_COVER and BACK_COVER +- āœ… Built-in YAML validation +- āœ… Test suite: 5 tests passing +- āœ… CLI with `--num-pages`, `--tiff-dir`, `--output-dir` + +**Functions**: +- `load_metadata_from_json(json_path)`: Read metadata +- `generate_pagedata(num_pages)`: Create page labels +- `generate_meta_yml(metadata, num_pages)`: Build YAML +- `validate_yaml(yaml_path)`: Structure verification +- `generate_from_volume(metadata_json, tiff_dir)`: Complete workflow + +**YAML Structure Generated**: +```yaml +capture_date: "2025-09-30" +scanner_user: "schipp0" +scanner_make: "Phase One" +scanner_model: "CaptureOne CH Edition" +scanning_order: "left-to-right" +reading_order: "left-to-right" +pagedata: + 00000001: + orderlabel: "00000001" + label: "FRONT_COVER" + 00000002: + orderlabel: "00000002" + label: "00000002" + # ... additional pages + 00000248: + orderlabel: "00000248" + label: "BACK_COVER" +``` + +--- + +#### Step 6: MD5 Checksum Generation (100%) +**Module**: `checksum_generator.py` +- āœ… ChecksumGenerator class for MD5 computation +- āœ… Compute MD5 hash with 8KB chunk-based reading +- āœ… Generate checksum.md5 file (excludes self) +- āœ… HathiTrust format: ` ` (two spaces) +- āœ… Verify checksums against package files +- āœ… Detect modified, missing, and valid files +- āœ… Test suite: 14 tests passing +- āœ… CLI via convenience function + +**Functions**: +- `compute_md5(file_path)`: Individual file MD5 +- `generate_checksums(package_directory)`: Create checksum.md5 +- `verify_checksums(checksum_file)`: Validate package integrity +- `generate_package_checksums(package_directory)`: Convenience wrapper + +--- + +#### Step 7: Package Assembly (100%) +**Module**: `package_assembler.py` +- āœ… PackageAssembler class for package organization +- āœ… Create flat directory structure (no subdirectories) +- āœ… Copy TIFF, TXT, HTML files to package directory +- āœ… Triplet validation (TIFF/TXT/HTML matching) +- āœ… Sequential numbering verification (no gaps) +- āœ… Checksum generation integration +- āœ… Comprehensive package validation +- āœ… Test suite: 11 tests passing +- āœ… CLI with `--tiff-dir`, `--text-dir`, `--hocr-dir`, `--meta-yml` + +**Functions**: +- `create_package_directory(volume_id)`: Package directory creation +- `copy_files_to_package(source_files, package_dir)`: File copying operations +- `validate_package_structure(package_dir)`: Package validation +- `assemble_package(volume_id, ...)`: Main assembly workflow + +--- + +#### Step 8: ZIP Archive Creation (100%) +**Module**: `zip_packager.py` +- āœ… ZIPPackager class for ZIP creation and validation +- āœ… Create ZIP with volume identifier filename +- āœ… Flat structure enforcement (no subdirectories) +- āœ… ZIP_DEFLATED compression +- āœ… macOS metadata filtering (._files, .DS_Store) +- āœ… Integrity verification with testzip() +- āœ… Structure validation (detect subdirectories) +- āœ… 
Expected files validation (optional) +- āœ… Content listing functionality +- āœ… ZIP extraction capabilities +- āœ… Test suite: 15 tests passing +- āœ… CLI with create, verify, list, extract modes + +**Functions**: +- `create_zip_archive(package_dir, volume_id)`: Create compliant ZIP +- `verify_zip_structure(zip_path, expected_files)`: Validate ZIP structure +- `list_zip_contents(zip_path)`: Enumerate ZIP files +- `extract_zip(zip_path, extract_to)`: Extract ZIP archive +- `create_package_zip(...)`: Convenience wrapper + +--- + +### In Progress šŸ”„ + +**None currently** - Ready to begin Step 10 + +--- + +### Remaining Implementation šŸ“‹ + +#### Step 9: Quality Control & Validation (100%) āœ… +**Module**: `package_validator.py` +- āœ… PackageValidator class for comprehensive HathiTrust compliance +- āœ… ValidationReport dataclass with detailed results +- āœ… Naming convention validation (barcode/ARK) +- āœ… ZIP structure validation (flat, no subdirectories) +- āœ… Required files verification (meta.yml, checksum.md5) +- āœ… File triplet validation (TIFF/TXT/HTML matching) +- āœ… Sequential numbering verification (no gaps) +- āœ… YAML metadata validation (structure and required fields) +- āœ… MD5 checksum verification (all files) +- āœ… Detailed validation reporting with categories +- āœ… Test suite: 15 tests passing +- āœ… CLI with verbose and JSON output modes +- āœ… Documentation: DEMO_step9.md + +**Functions Implemented**: +- `validate_package(zip_path)`: Comprehensive package validation +- `_validate_naming()`: Check identifier format +- `_validate_structure()`: Verify flat structure +- `_validate_required_files()`: Check meta.yml, checksum.md5 +- `_validate_triplets()`: Verify TIFF/TXT/HTML matching +- `_validate_sequential_numbering()`: Check for gaps +- `_validate_yaml_metadata()`: Validate YAML structure +- `_validate_checksums()`: Verify all MD5 hashes +- `validate_hathitrust_package()`: Convenience function + +--- + +#### Step 10: Main Pipeline Orchestration (0%) +**Planned Module**: `main_pipeline.py` + +**Requirements**: +- Integrate all modules (Steps 1-9) +- Batch processing for multiple volumes +- Error recovery (continue on individual failures) +- Progress tracking with tqdm +- Comprehensive logging +- Processing report generation (CSV/JSON) +- Support for partial re-runs (skip completed volumes) + +**Functions to implement**: +```python +main_pipeline() -> ProcessingResults +process_volume(volume_id) -> VolumeResult +generate_processing_report(results) -> Path +``` + +**Processing Flow**: +``` +1. Discover volumes (volume_discovery) +2. For each volume: + a. Load metadata JSON + b. Process OCR (ocr_processor) + c. Validate filenames (file_validator) + d. Generate YAML (yaml_generator) + e. Generate checksums (checksum_generator) + f. Assemble package (package_assembler) + g. Create ZIP (zip_packager) + h. Validate package (package_validator) +3. 
Generate final report +``` + +--- + +## Test Coverage Status + +### Current Test Statistics +- **Total tests**: 78 (7 + 3 + 8 + 5 + 14 + 11 + 15 + 15) +- **Passing**: 77 (98.7%) +- **Skipped**: 1 (1.3%) - OCR test requires tesseract system install +- **Failing**: 0 +- **Coverage**: Steps 1-9 fully tested +- **Execution time**: ~0.50 seconds + +### Test Validation +āœ… All tests verified with pytest 8.4.2 on 2025-10-01 + +### Test Files +- āœ… `test_volume_discovery.py` (7 tests) +- āœ… `test_ocr_processor.py` (2 passed, 1 skipped) +- āœ… `test_file_validator.py` (8 tests) +- āœ… `test_yaml_generator.py` (5 tests) +- āœ… `test_checksum_generator.py` (14 tests) +- āœ… `test_package_assembler.py` (11 tests) +- āœ… `test_zip_packager.py` (15 tests) +- āœ… `test_package_validator.py` (15 tests) +- ā³ `test_main_pipeline.py` (integration tests, pending) + +--- + +## Git Repository Status + +### Commit History +1. **40ce797** - Initial commit: Steps 1-3 implementation +2. **9f0cf76** - Step 4: File Validation & Naming Convention +3. **5de76a8** - Step 6: MD5 Checksum Generation - 14 tests passing +4. **b9209a5** - Remove DEMO files from repo and add to .gitignore + +### Branch Status +- **Current**: master +- **Tracking**: origin/master +- **Remote**: https://github.itap.purdue.edu/schipp0/hathitrust-package-automation + +### Statistics +- **Commits**: 4 +- **Files tracked**: 20+ +- **Total insertions**: ~2625 lines (minus removed DEMO files) +- **Contributors**: 1 (schipp0) + +--- + +## Known Issues & Technical Debt + +### Current Known Issues +- **None reported** - All implemented modules working as expected + +### Design Decisions Requiring Documentation +1. **Custom YAML generation** instead of HathiTrustYAMLgenerator repo + - Rationale: Simpler integration, more control + - Trade-off: Need to maintain compliance manually +2. **Sequential OCR processing** instead of parallel + - Rationale: Memory constraints, error isolation + - Future: Consider multiprocessing for Step 10 +3. **Per-package metadata JSON** instead of static config + - Rationale: Different volumes have different capture settings + - Benefit: Flexibility for varying DPI, compression, scanner info + +### Future Enhancements Considered +- Parallel volume processing (multiprocessing) +- Incremental processing (skip already-processed pages) +- Progress persistence (resume interrupted batches) +- GPU-accelerated OCR engines +- Cloud storage integration (S3) +- Web dashboard for monitoring +- Database for processing history + +--- + +## Next Immediate Actions + +### Priority 1: Complete Core Pipeline +1. āœ… Step 5 complete - YAML Generation +2. āœ… Step 6 complete - MD5 Checksum Generation +3. āœ… Step 7 complete - Package Assembly +4. āœ… Step 8 complete - ZIP Archive Creation +5. āœ… Step 9 complete - Quality Control & Validation +6. 
šŸ”„ **Next**: Step 10 (Main Pipeline Orchestration) + +### Priority 2: Testing & Validation +- āœ… Test suite for Step 9 complete (15 tests) +- Integration testing for Step 10 +- End-to-end test with sample volumes +- HathiTrust validation tool testing + +### Priority 3: Documentation +- āœ… DEMO_step9.md complete with comprehensive examples +- Update README with Step 9 completion +- Document full pipeline usage after Step 10 +- Create troubleshooting guide + +--- + +## Success Metrics (Target vs Current) + +| Metric | Target | Current | Status | +|--------|--------|---------|--------| +| Pipeline Modules | 10 | 9 | 90% āœ… | +| Unit Tests | 50+ | 78 | 156% āœ… | +| Test Coverage | 90%+ | ~94% | āœ… | +| Volumes Processed | 1+ | 0 | ā³ | +| HathiTrust Submissions | 1+ | 0 | ā³ | + +**Overall Progress**: **90% Complete** (Steps 1-9 of 10) diff --git a/README.md b/README.md index eecae9d..80b9da9 100644 --- a/README.md +++ b/README.md @@ -1,181 +1,180 @@ # HathiTrust Package Automation Pipeline -## Project Structure -``` -HathiTrust/ -ā”œā”€ā”€ .memory-bank/ # Project memory storage -ā”œā”€ā”€ input/ # Source TIFF files (organized by barcode/ARK) -ā”œā”€ā”€ output/ # Final ZIP packages -ā”œā”€ā”€ temp/ # Intermediate processing files -ā”œā”€ā”€ logs/ # Processing logs -ā”œā”€ā”€ config.yaml # Global configuration -ā”œā”€ā”€ metadata_template.json # Template for package metadata -ā”œā”€ā”€ collect_metadata.py # Interactive metadata collection -ā”œā”€ā”€ requirements.txt # Python dependencies -└── README.md # This file -``` +Automated pipeline for creating HathiTrust-compliant submission packages from TIFF images. Processes digitized content through OCR, metadata generation, and packaging into HathiTrust SIP (Submission Information Package) format. + +## Features + +- **Automated OCR**: Generates plain text and coordinate OCR (hOCR format) using Tesseract +- **Per-Package Metadata**: Variable capture settings per submission (DPI, color mode, compression) +- **HathiTrust Compliance**: Meets all technical requirements for submission packages +- **Batch Processing**: Process multiple volumes sequentially or in parallel +- **Validation**: Comprehensive checks for file naming, checksums, and package structure +- **CaptureOne Integration**: Designed for content digitized via CaptureOne Cultural Heritage Edition + +## Prerequisites -## Setup Instructions +- **Python 3.8+** +- **Tesseract OCR** (with desired language packs) +- **System**: Linux/macOS/Windows with command-line access + +## Installation ### 1. Install System Dependencies + ```bash +# Ubuntu/Debian sudo apt-get update sudo apt-get install tesseract-ocr tesseract-ocr-eng + +# macOS +brew install tesseract tesseract-lang + +# Windows: Download installer from https://github.com/UB-Mannheim/tesseract/wiki ``` -### 2. Install Python Dependencies +### 2. Clone Repository and Install Python Dependencies + ```bash +git clone +cd HathiTrust +python3 -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate pip install -r requirements.txt ``` -### 3. Clone YAML Generator +### 3. 
Clone HathiTrust YAML Generator + ```bash -cd /home/schipp0/Digitization/HathiTrust git clone https://github.com/moriahcaruso/HathiTrustYAMLgenerator.git ``` -## Workflow: Creating a Submission Package +## Project Structure + +``` +HathiTrust/ +ā”œā”€ā”€ src/ # Pipeline modules +│ ā”œā”€ā”€ main_pipeline.py # Main orchestration script +│ ā”œā”€ā”€ volume_discovery.py # Volume identification and grouping +│ ā”œā”€ā”€ ocr_processor.py # OCR generation (text + hOCR) +│ ā”œā”€ā”€ file_validator.py # File naming and validation +│ ā”œā”€ā”€ yaml_generator.py # meta.yml creation +│ ā”œā”€ā”€ checksum_generator.py # MD5 checksum generation +│ ā”œā”€ā”€ package_assembler.py # Package assembly +│ ā”œā”€ā”€ zip_packager.py # ZIP archive creation +│ └── package_validator.py # Final validation +ā”œā”€ā”€ input/ # Source TIFF files +ā”œā”€ā”€ output/ # Final ZIP packages +ā”œā”€ā”€ temp/ # Working directory +ā”œā”€ā”€ logs/ # Processing logs +ā”œā”€ā”€ tests/ # Test suite +ā”œā”€ā”€ config.yaml # Configuration file +ā”œā”€ā”€ metadata_template.json # Metadata template +ā”œā”€ā”€ requirements.txt # Python dependencies +└── README.md # This file +``` + +## Configuration + +Edit `config.yaml` to set: +- Directory paths (input, output, temp, logs) +- OCR settings (language, Tesseract config) +- Processing options (parallel processing, cleanup, validation) + +Example: +```yaml +directories: + input: "/path/to/input" + output: "/path/to/output" + +ocr: + language: "eng" + tesseract_config: "--psm 1" + +processing: + parallel_volumes: false + interactive_metadata: true +``` -### Step 1: Prepare TIFF Files -Place digitized TIFF files in `input/` directory: -- Files should follow naming: `_00000001.tif`, `_00000002.tif`, etc. -- Or: `_00000001.tif`, `_00000002.tif`, etc. +## Usage -### Step 2: Collect Package Metadata -Run the interactive metadata collection tool: -```bash -./collect_metadata.py +### 1. Prepare TIFF Files + +Place digitized TIFF files in `input/` directory with naming format: +``` +_00000001.tif +_00000002.tif +... ``` -This will prompt you for: -- **Volume identifier** (barcode or ARK) -- **Capture info** (date, operator, CaptureOne version) -- **Image specs** (DPI, color mode, compression) -- **Page order** (scanning/reading order) -- **Content type** (book, journal, manuscript, etc.) +Or using ARK identifiers: +``` +_00000001.tif +_00000002.tif +... +``` -Metadata is saved as: `metadata_.json` +### 2. Collect Metadata (Optional Interactive Mode) -### Step 3: Process Package -(Main processing script to be implemented) ```bash -./process_package.py --metadata metadata_.json +python src/collect_metadata.py ``` -This will: -1. Validate TIFF files -2. Run OCR (text + hOCR coordinates) -3. Generate meta.yml -4. Create checksum.md5 -5. Package into ZIP - -## Key Features - -### Per-Package Metadata -Unlike scanner-based workflows with static settings, this pipeline supports **variable capture settings** per submission: -- Different DPI (300, 400, 600, etc.) -- Various color modes (bitonal, grayscale, color) -- Multiple compression types -- Flexible reading orders - -### CaptureOne Integration -Designed for content digitized via **CaptureOne Cultural Heritage Edition**, not physical scanners. 
- -### HathiTrust Compliance -Output packages meet all HathiTrust requirements: -- 8-digit sequential file naming -- Plain text OCR (.txt) -- Coordinate OCR (.html hOCR format) -- meta.yml metadata -- checksum.md5 fixity file -- Proper ZIP structure (no subdirectories) - -## Next Development Steps -- [ ] Implement main processing script -- [ ] Integrate with HathiTrustYAMLgenerator -- [ ] Add validation checks -- [ ] Test with sample packages -- [ ] Add batch processing support - - -## Implementation Status - -### āœ… Step 1: Configuration & Setup -- Directory structure created -- Per-package metadata collection (`collect_metadata.py`) -- Configuration files (`config.yaml`, `metadata_template.json`) - -### āœ… Step 2: Directory Discovery & Organization -- Volume discovery module (`volume_discovery.py`) -- Barcode and ARK identifier extraction -- Sequential file validation -- Test suite with 7 passing tests -- Test file generator for development - -**Usage:** -```bash -# Discover volumes in input directory -python3 volume_discovery.py input/ +This prompts for capture information, image specifications, and page order details. -# Create test files -python3 volume_discovery.py --create-test --barcode 39015012345678 --num-files 5 -# Run tests -python3 test_volume_discovery.py -``` +### 3. Run Pipeline -### āœ… Step 3: OCR Processing Pipeline -- OCR processor module (`ocr_processor.py`) -- Plain text OCR generation (.txt files) -- Coordinate OCR generation (.html hOCR format) -- Text sanitization (control character removal) -- UTF-8 encoding enforcement -- Batch processing with error handling -- Test suite with Tesseract integration tests +**Process all volumes:** +```bash +python src/main_pipeline.py +``` -**Usage:** +**Process single volume:** ```bash -# Process all volumes with OCR -python3 ocr_processor.py input/ +python src/main_pipeline.py --volume-id 39015012345678 +``` -# Process specific volume -python3 ocr_processor.py input/ --volume-id 39015012345678 +**Additional options:** +```bash +# Resume (skip existing valid ZIPs) +python src/main_pipeline.py --resume -# Custom language/output -python3 ocr_processor.py input/ --language fra --output-dir /tmp/ocr +# Keep temporary working directories +python src/main_pipeline.py --keep-temp -# Run tests -python3 test_ocr_processor.py +# Specify custom config +python src/main_pipeline.py --config custom_config.yaml ``` -### āœ… Step 4: File Validation & Naming Convention -- File validator module (`file_validator.py`) -- 8-digit zero-padded sequential naming enforcement -- Gap detection in sequences -- Automatic file renaming to HathiTrust standard -- TIFF/TXT/HTML triplet verification -- Dry-run mode for safe testing -- Test suite with 8 passing tests +## HathiTrust Compliance -**Usage:** -```bash -# Verify files are properly named -python3 file_validator.py temp/39015012345678 --verify-only +Output packages meet all HathiTrust submission requirements: -# Validate and rename files (dry-run) -python3 file_validator.py input/ --extension tif --dry-run +- **8-digit sequential file naming**: `00000001.tif`, `00000001.txt`, `00000001.html` +- **Plain text OCR**: UTF-8 encoded `.txt` files with sanitized text +- **Coordinate OCR**: hOCR format `.html` files with word-level coordinates +- **meta.yml metadata**: YAML file with capture settings, scanning order, and page data +- **checksum.md5 fixity file**: MD5 hashes for all package files +- **Flat directory structure**: No subdirectories in ZIP archives +- **Proper ZIP naming**: Uses barcode or ARK 
identifier -# Actually rename files -python3 file_validator.py input/ --extension tif +## Pipeline Stages -# Run tests -python3 test_file_validator.py -``` +1. **Volume Discovery**: Identify and group TIFF files by identifier +2. **OCR Processing**: Generate text and coordinate OCR with Tesseract +3. **File Validation**: Verify sequential naming and completeness +4. **YAML Generation**: Create metadata files from capture information +5. **Checksum Generation**: Compute MD5 hashes for all files +6. **Package Assembly**: Organize into HathiTrust-compliant structure +7. **ZIP Creation**: Package into properly-named archives +8. **Validation**: Verify compliance before submission + +## Documentation + +- **HathiTrust Specifications**: https://www.hathitrust.org/member-libraries/contribute-content/ +- **Technical Requirements**: https://www.hathitrust.org/member-libraries/resources-for-librarians/contributor-toolkit/ +- **YAML Generator**: https://github.com/moriahcaruso/HathiTrustYAMLgenerator + +## License -### šŸ”„ Next Steps -- Step 5: YAML Metadata Generation -- Step 6: MD5 Checksum Generation -- Step 7: Package Assembly -- Step 8: ZIP Archive Creation -- Step 9: Quality Control & Validation -- Step 10: Main Processing Pipeline +[Add license information here] diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..72d6e1e --- /dev/null +++ b/docs/README.md @@ -0,0 +1,206 @@ +# HathiTrust Package Automation Pipeline + +## Project Structure +``` +HathiTrust/ +ā”œā”€ā”€ .memory-bank/ # Project memory storage +ā”œā”€ā”€ input/ # Source TIFF files (organized by barcode/ARK) +ā”œā”€ā”€ output/ # Final ZIP packages +ā”œā”€ā”€ temp/ # Intermediate processing files +ā”œā”€ā”€ logs/ # Processing logs +ā”œā”€ā”€ config.yaml # Global configuration +ā”œā”€ā”€ metadata_template.json # Template for package metadata +ā”œā”€ā”€ collect_metadata.py # Interactive metadata collection +ā”œā”€ā”€ requirements.txt # Python dependencies +└── README.md # This file +``` + +## Setup Instructions + +### 1. Install System Dependencies +```bash +sudo apt-get update +sudo apt-get install tesseract-ocr tesseract-ocr-eng +``` + +### 2. Install Python Dependencies +```bash +pip install -r requirements.txt +``` + +### 3. Clone YAML Generator +```bash +cd /home/schipp0/Digitization/HathiTrust +git clone https://github.com/moriahcaruso/HathiTrustYAMLgenerator.git +``` + +## Workflow: Creating a Submission Package + +### Step 1: Prepare TIFF Files +Place digitized TIFF files in `input/` directory: +- Files should follow naming: `_00000001.tif`, `_00000002.tif`, etc. +- Or: `_00000001.tif`, `_00000002.tif`, etc. + +### Step 2: Collect Package Metadata +Run the interactive metadata collection tool: +```bash +./collect_metadata.py +``` + +This will prompt you for: +- **Volume identifier** (barcode or ARK) +- **Capture info** (date, operator, CaptureOne version) +- **Image specs** (DPI, color mode, compression) +- **Page order** (scanning/reading order) +- **Content type** (book, journal, manuscript, etc.) + +Metadata is saved as: `metadata_.json` + +### Step 3: Process Package +(Main processing script to be implemented) +```bash +./process_package.py --metadata metadata_.json +``` + +This will: +1. Validate TIFF files +2. Run OCR (text + hOCR coordinates) +3. Generate meta.yml +4. Create checksum.md5 +5. 
Package into ZIP + +## Key Features + +### Per-Package Metadata +Unlike scanner-based workflows with static settings, this pipeline supports **variable capture settings** per submission: +- Different DPI (300, 400, 600, etc.) +- Various color modes (bitonal, grayscale, color) +- Multiple compression types +- Flexible reading orders + +### CaptureOne Integration +Designed for content digitized via **CaptureOne Cultural Heritage Edition**, not physical scanners. + +### HathiTrust Compliance +Output packages meet all HathiTrust requirements: +- 8-digit sequential file naming +- Plain text OCR (.txt) +- Coordinate OCR (.html hOCR format) +- meta.yml metadata +- checksum.md5 fixity file +- Proper ZIP structure (no subdirectories) + +## Next Development Steps +- [ ] Implement main processing script +- [ ] Integrate with HathiTrustYAMLgenerator +- [ ] Add validation checks +- [ ] Test with sample packages +- [ ] Add batch processing support + + +## Implementation Status + +### āœ… Step 1: Configuration & Setup +- Directory structure created +- Per-package metadata collection (`collect_metadata.py`) +- Configuration files (`config.yaml`, `metadata_template.json`) + +### āœ… Step 2: Directory Discovery & Organization +- Volume discovery module (`volume_discovery.py`) +- Barcode and ARK identifier extraction +- Sequential file validation +- Test suite with 7 passing tests +- Test file generator for development + +**Usage:** +```bash +# Discover volumes in input directory +python3 volume_discovery.py input/ + +# Create test files +python3 volume_discovery.py --create-test --barcode 39015012345678 --num-files 5 + +# Run tests +python3 test_volume_discovery.py +``` + +### āœ… Step 3: OCR Processing Pipeline +- OCR processor module (`ocr_processor.py`) +- Plain text OCR generation (.txt files) +- Coordinate OCR generation (.html hOCR format) +- Text sanitization (control character removal) +- UTF-8 encoding enforcement +- Batch processing with error handling +- Test suite with Tesseract integration tests + +**Usage:** +```bash +# Process all volumes with OCR +python3 ocr_processor.py input/ + +# Process specific volume +python3 ocr_processor.py input/ --volume-id 39015012345678 + +# Custom language/output +python3 ocr_processor.py input/ --language fra --output-dir /tmp/ocr + +# Run tests +python3 test_ocr_processor.py +``` + +### āœ… Step 4: File Validation & Naming Convention +- File validator module (`file_validator.py`) +- 8-digit zero-padded sequential naming enforcement +- Gap detection in sequences +- Automatic file renaming to HathiTrust standard +- TIFF/TXT/HTML triplet verification +- Dry-run mode for safe testing +- Test suite with 8 passing tests + +**Usage:** +```bash +# Verify files are properly named +python3 file_validator.py temp/39015012345678 --verify-only + +# Validate and rename files (dry-run) +python3 file_validator.py input/ --extension tif --dry-run + +# Actually rename files +python3 file_validator.py input/ --extension tif + +# Run tests +python3 test_file_validator.py +``` + +### āœ… Step 5: YAML Metadata Generation +- YAML generator module (`yaml_generator.py`) +- Generates HathiTrust-compliant meta.yml files +- Reads per-package metadata from JSON +- Auto-detects page count from TIFF directory +- Includes capture metadata and technical specifications +- Generates pagedata with orderlabels and page tags +- Built-in YAML validation +- Test suite with 5 passing tests + +**Usage:** +```bash +# Generate meta.yml with auto page detection +python3 yaml_generator.py 
metadata_39015012345678.json \ + --tiff-dir temp/39015012345678 \ + --output-dir output/39015012345678 + +# Or specify page count manually +python3 yaml_generator.py metadata_39015012345678.json \ + --num-pages 150 \ + --output-dir output/39015012345678 + +# Run tests +python3 test_yaml_generator.py +``` + +### šŸ”„ Next Steps +- Step 6: MD5 Checksum Generation +- Step 7: Package Assembly +- Step 8: ZIP Archive Creation +- Step 9: Quality Control & Validation +- Step 10: Main Processing Pipeline diff --git a/docs/TEST_SUMMARY.md b/docs/TEST_SUMMARY.md new file mode 100644 index 0000000..c50717f --- /dev/null +++ b/docs/TEST_SUMMARY.md @@ -0,0 +1,101 @@ +# Test Suite Summary + +## Overall Results +**āœ… 36 tests passing | ā­ļø 1 skipped | āŒ 0 failures** + +Test execution time: **0.11 seconds** + +--- + +## Module Test Results + +### test_checksum_generator.py (14 tests) +āœ… All tests passing +- MD5 computation and consistency +- Checksum.md5 file generation and format +- Self-exclusion verification +- Checksum verification (valid/invalid/missing files) +- Error handling (empty/nonexistent directories) +- Binary file support + +### test_file_validator.py (8 tests) +āœ… All tests passing +- Sequence number extraction and formatting +- Filename validation (8-digit format) +- Sequential naming verification +- Gap detection +- Triplet matching (TIFF/TXT/HTML) + +### test_ocr_processor.py (3 tests) +āœ… 2 passing | ā­ļø 1 skipped +- Processor initialization +- Control character removal +- *Skipped: Single file OCR test (requires tesseract system install)* + +### test_volume_discovery.py (7 tests) +āœ… All tests passing +- Barcode extraction +- ARK identifier extraction +- Sequence number parsing +- Volume grouping and sorting +- Gap detection in sequences +- Sequential validation + +### test_yaml_generator.py (5 tests) +āœ… All tests passing +- Metadata loading from JSON +- Pagedata generation +- meta.yml creation +- YAML structure validation +- Complete volume workflow + +--- + +## Dependencies Installed +- pytest==8.4.2 +- pytesseract==0.3.13 +- Pillow==11.3.0 +- PyYAML==6.0.3 +- tqdm==4.67.1 + +--- + +## Testing Configuration +- **Python**: 3.12.3 +- **Platform**: Linux +- **Pytest**: 8.4.2 +- **Root directory**: /home/schipp0/Digitization/HathiTrust + +--- + +## Notes +- All core pipeline modules (Steps 1-6) have comprehensive test coverage +- Tests use temporary directories and fixtures for isolation +- No test pollution or side effects +- All tests can be run in any order + +--- + +## Running Tests + +### Run all project tests: +```bash +cd /home/schipp0/Digitization/HathiTrust +source bin/activate +python -m pytest test_*.py -v +``` + +### Run specific module: +```bash +python -m pytest test_checksum_generator.py -v +``` + +### Run with coverage: +```bash +python -m pytest test_*.py --cov=. --cov-report=html +``` + +--- + +**Last Updated**: 2025-09-30 +**Commit**: b9209a5 (DEMO files removed from repo) \ No newline at end of file diff --git a/lib64 b/lib64 new file mode 120000 index 0000000..7951405 --- /dev/null +++ b/lib64 @@ -0,0 +1 @@ +lib \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..c7c7c40 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1,21 @@ +""" +HathiTrust Package Automation - Source Code +=========================================== + +This package contains the core processing modules for automating +HathiTrust submission package creation from TIFF images. 
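+The modules are orchestrated by main_pipeline, which runs them in
+sequence for each discovered volume and writes the finished ZIP archive
+to the configured output directory.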
+ +Modules: + - volume_discovery: Scan and organize TIFF batches by volume identifier + - file_validator: Validate filenames and sequence integrity + - ocr_processor: Generate plain text and coordinate OCR + - yaml_generator: Create meta.yml metadata files + - package_assembler: Assemble complete submission packages + - checksum_generator: Generate MD5 checksums + - package_validator: Validate final packages + - zip_packager: Create HathiTrust-compliant ZIP archives + - collect_metadata: Gather volume metadata +""" + +__version__ = '0.1.0' +__author__ = 'HathiTrust Digitization Team' diff --git a/checksum_generator.py b/src/checksum_generator.py similarity index 93% rename from checksum_generator.py rename to src/checksum_generator.py index 00b0279..02ab0db 100644 --- a/checksum_generator.py +++ b/src/checksum_generator.py @@ -43,6 +43,20 @@ def compute_md5(self, file_path: str) -> str: return md5_hasher.hexdigest() + def compute_md5_from_bytes(self, data: bytes) -> str: + """ + Calculate MD5 hash of byte data. + + Args: + data: Byte data to hash + + Returns: + MD5 hash as lowercase hexadecimal string + """ + md5_hasher = hashlib.md5() + md5_hasher.update(data) + return md5_hasher.hexdigest() + def generate_checksums(self, package_directory: str, output_file: str = "checksum.md5") -> Dict: """ Generate checksum.md5 file for all files in package directory. diff --git a/collect_metadata.py b/src/collect_metadata.py similarity index 100% rename from collect_metadata.py rename to src/collect_metadata.py diff --git a/file_validator.py b/src/file_validator.py similarity index 100% rename from file_validator.py rename to src/file_validator.py diff --git a/src/main_pipeline.py b/src/main_pipeline.py new file mode 100644 index 0000000..a42d612 --- /dev/null +++ b/src/main_pipeline.py @@ -0,0 +1,724 @@ +#!/usr/bin/env python3 +""" +HathiTrust Package Automation Pipeline - Main Orchestration Module + +This module orchestrates the complete processing pipeline for creating +HathiTrust-compliant submission packages from TIFF images. + +Pipeline Stages: +1. Volume Discovery - Identify and group TIFF files by volume identifier +2. OCR Processing - Generate plain text and coordinate OCR +3. File Validation - Standardize naming and verify triplets +4. YAML Generation - Create meta.yml metadata files +5. Checksum Generation - Compute MD5 hashes +6. Package Assembly - Organize into flat directory structure +7. ZIP Creation - Create compliant ZIP archives +8. 
Package Validation - Verify HathiTrust compliance + +Usage: + python main_pipeline.py # Process all volumes + python main_pipeline.py --volume-id ID # Process single volume + python main_pipeline.py --resume # Skip existing valid ZIPs + python main_pipeline.py --keep-temp # Keep working directories +""" + +import argparse +import csv +import json +import logging +import shutil +import time +from dataclasses import dataclass, field, asdict +from datetime import datetime +from pathlib import Path +from typing import List, Optional, Dict, Any +import yaml + +from tqdm import tqdm + +# Import pipeline modules +from volume_discovery import discover_volumes, VolumeGroup +from ocr_processor import OCRProcessor +from file_validator import FileValidator +from yaml_generator import YAMLGenerator +from checksum_generator import ChecksumGenerator +from package_assembler import PackageAssembler +from zip_packager import ZIPPackager +from package_validator import PackageValidator + + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + +@dataclass +class PipelineConfig: + """Configuration for pipeline execution.""" + input_dir: Path + output_dir: Path + temp_dir: Path + logs_dir: Path + config_path: Path + ocr_language: str = 'eng' + resume_mode: bool = False + keep_temp: bool = False + verbose: bool = False + volume_id: Optional[str] = None + + +@dataclass +class VolumeResult: + """Result of processing a single volume.""" + volume_id: str + status: str # 'SUCCESS' or 'FAILED' + failed_stage: Optional[str] = None + error_message: Optional[str] = None + output_zip_path: Optional[Path] = None + processing_time: float = 0.0 + validation_report: Optional[Dict[str, Any]] = None + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for reporting.""" + return { + 'volume_id': self.volume_id, + 'status': self.status, + 'failed_stage': self.failed_stage, + 'error_message': self.error_message, + 'output_path': str(self.output_zip_path) if self.output_zip_path else None, + 'processing_time_seconds': round(self.processing_time, 2), + 'validation_report': self.validation_report + } + + +@dataclass +class ProcessingResults: + """Aggregate results for batch processing.""" + successful_volumes: List[VolumeResult] = field(default_factory=list) + failed_volumes: List[VolumeResult] = field(default_factory=list) + total_processing_time: float = 0.0 + report_path: Optional[Path] = None + + @property + def total_volumes(self) -> int: + return len(self.successful_volumes) + len(self.failed_volumes) + + @property + def success_rate(self) -> float: + if self.total_volumes == 0: + return 0.0 + return len(self.successful_volumes) / self.total_volumes * 100 + + +def load_configuration(args: argparse.Namespace) -> PipelineConfig: + """Load configuration from config file and command-line arguments.""" + # Load config.yaml if it exists + config_path = Path(args.config) + config_data = {} + + if config_path.exists(): + with open(config_path, 'r') as f: + config_data = yaml.safe_load(f) or {} + + # Extract paths from config with defaults + paths = config_data.get('paths', {}) + input_dir = Path(args.input_dir) if args.input_dir else Path(paths.get('input_dir', 'input/')) + output_dir = Path(args.output_dir) if args.output_dir else Path(paths.get('output_dir', 'output/')) + temp_dir = Path(paths.get('temp_dir', 'temp/')) + logs_dir = Path(paths.get('logs_dir', 'logs/')) + + # OCR settings + ocr_config = 
config_data.get('ocr', {}) + ocr_language = ocr_config.get('language', 'eng') + + return PipelineConfig( + input_dir=input_dir, + output_dir=output_dir, + temp_dir=temp_dir, + logs_dir=logs_dir, + config_path=config_path, + ocr_language=ocr_language, + resume_mode=args.resume, + keep_temp=args.keep_temp, + verbose=args.verbose, + volume_id=args.volume_id + ) + + +def setup_logging(config: PipelineConfig) -> None: + """Configure logging to file and console.""" + # Create logs directory + config.logs_dir.mkdir(parents=True, exist_ok=True) + + # Create timestamped log file + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + log_file = config.logs_dir / f'pipeline_{timestamp}.log' + + # Configure file handler + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + )) + + # Configure console handler + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.DEBUG if config.verbose else logging.INFO) + console_handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s')) + + # Add handlers to root logger + root_logger = logging.getLogger() + root_logger.setLevel(logging.DEBUG) + root_logger.handlers.clear() + root_logger.addHandler(file_handler) + root_logger.addHandler(console_handler) + + logger.info(f"Logging to: {log_file}") + + +def check_metadata_file(volume_id: str) -> Path: + """ + Check if metadata JSON file exists for the volume. + + Args: + volume_id: Volume identifier + + Returns: + Path to metadata file + + Raises: + FileNotFoundError: If metadata file doesn't exist + """ + metadata_file = Path(f"metadata_{volume_id}.json") + + if not metadata_file.exists(): + raise FileNotFoundError( + f"Metadata file not found: {metadata_file}\n" + f"Run: python collect_metadata.py {volume_id}" + ) + + return metadata_file + + +def check_existing_package(volume_id: str, output_dir: Path) -> Optional[Path]: + """ + Check if a valid ZIP package already exists for this volume. + + Args: + volume_id: Volume identifier + output_dir: Output directory path + + Returns: + Path to existing valid ZIP, or None if doesn't exist or invalid + """ + zip_path = output_dir / f"{volume_id}.zip" + + if not zip_path.exists(): + return None + + # Quick validation check + try: + validator = PackageValidator(zip_path) + report = validator.validate_package() + if report.is_valid: + return zip_path + except Exception: + pass + + return None + + +def process_volume( + volume_id: str, + volume_group: VolumeGroup, + config: PipelineConfig +) -> VolumeResult: + """ + Process a single volume through the complete pipeline. 
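+
+    Stages run in this order: metadata check, working-directory setup,
+    OCR processing, file validation, YAML generation, package assembly,
+    ZIP creation, and final package validation. On success the temporary
+    working directory is removed unless config.keep_temp is set; on
+    failure the current stage name and error are recorded in the
+    returned VolumeResult.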
+ + Args: + volume_id: Volume identifier + volume_group: VolumeGroup with TIFF files + config: Pipeline configuration + + Returns: + VolumeResult with processing outcome + """ + start_time = time.time() + current_stage = "initialization" + + logger.info(f"Starting processing for volume: {volume_id}") + + try: + # Stage 1: Check metadata file exists + current_stage = "metadata_check" + logger.debug(f"[{volume_id}] Checking metadata file...") + metadata_path = check_metadata_file(volume_id) + + # Stage 2: Create working directory + current_stage = "setup" + work_dir = config.temp_dir / volume_id + work_dir.mkdir(parents=True, exist_ok=True) + logger.debug(f"[{volume_id}] Working directory: {work_dir}") + + # Create output directories + text_dir = work_dir / "text" + hocr_dir = work_dir / "hocr" + package_dir = work_dir / "package" + text_dir.mkdir(exist_ok=True) + hocr_dir.mkdir(exist_ok=True) + package_dir.mkdir(exist_ok=True) + + # Stage 3: OCR Processing + current_stage = "ocr_processing" + logger.info(f"[{volume_id}] Running OCR processing...") + ocr_processor = OCRProcessor(language=config.ocr_language) + + ocr_results = [] + for tiff_file in tqdm(volume_group.tiff_files, + desc=f"OCR {volume_id}", + disable=not config.verbose): + result = ocr_processor.process_single_file( + tiff_path=tiff_file, + output_dir=work_dir + ) + ocr_results.append(result) + + # Check for OCR errors + if not result.success: + logger.warning(f"[{volume_id}] OCR failed for {tiff_file.name}: {result.error}") + + # Check if we have enough successful OCR results + successful_ocr = [r for r in ocr_results if r.success] + if len(successful_ocr) == 0: + raise RuntimeError("All OCR processing failed") + + # Stage 4: File Validation + current_stage = "file_validation" + logger.info(f"[{volume_id}] Validating files...") + validator = FileValidator() + + # Collect all files (TIFFs are still in input dir, OCR outputs in work_dir) + tiff_files = volume_group.tiff_files # Original TIFFs from input directory + txt_files = sorted(work_dir.glob("*.txt")) + html_files = sorted(work_dir.glob("*.html")) + + # Verify sequential naming + if not validator.verify_sequential_naming(tiff_files): + raise ValueError("TIFF files have gaps in sequential numbering") + + # Verify matching triplets + if not validator.verify_matching_triplets(tiff_files, txt_files, html_files): + raise ValueError("File triplets don't match (TIFF/TXT/HTML)") + + # Stage 5: YAML Generation + current_stage = "yaml_generation" + logger.info(f"[{volume_id}] Generating YAML metadata...") + yaml_gen = YAMLGenerator() + + # Generate meta.yml + meta_yml_path = yaml_gen.generate_from_volume( + volume_id=volume_id, + metadata_json=metadata_path, + tiff_files=tiff_files, + output_dir=work_dir + ) + logger.debug(f"[{volume_id}] Generated: {meta_yml_path}") + + # Stage 6: Package Assembly + current_stage = "package_assembly" + logger.info(f"[{volume_id}] Assembling package...") + assembler = PackageAssembler(output_base_dir=work_dir) + + # Debug: Log what files we're passing + logger.debug(f"TIFF files to copy ({len(tiff_files)}): {[f.name for f in tiff_files[:3]]}") + logger.debug(f"TXT files to copy ({len(txt_files)}): {[f.name for f in txt_files[:3]]}") + logger.debug(f"HTML files to copy ({len(html_files)}): {[f.name for f in html_files[:3]]}") + + package_dir = assembler.assemble_package( + volume_id=volume_id, + tiff_files=tiff_files, + text_files=txt_files, + hocr_files=html_files, + meta_yml=meta_yml_path + ) + logger.debug(f"[{volume_id}] Package assembled: 
{package_dir}") + + # Stage 7: ZIP Creation + current_stage = "zip_creation" + logger.info(f"[{volume_id}] Creating ZIP archive...") + + # Ensure output directory exists + config.output_dir.mkdir(parents=True, exist_ok=True) + + packager = ZIPPackager(output_dir=config.output_dir) + zip_path = packager.create_zip_archive( + package_dir=package_dir, + volume_id=volume_id + ) + logger.debug(f"[{volume_id}] ZIP created: {zip_path}") + + # Stage 8: Package Validation + current_stage = "validation" + logger.info(f"[{volume_id}] Validating package...") + + pkg_validator = PackageValidator() + validation_report = pkg_validator.validate_package(zip_path) + + if not validation_report.is_valid: + error_summary = "\n".join([f" - {e}" for e in validation_report.errors]) + logger.error(f"[{volume_id}] Validation failed:\n{error_summary}") + raise ValueError(f"Package validation failed: {len(validation_report.errors)} errors") + + logger.info(f"[{volume_id}] Validation passed āœ“") + + # Stage 9: Cleanup + current_stage = "cleanup" + processing_time = time.time() - start_time + + if not config.keep_temp: + logger.debug(f"[{volume_id}] Cleaning up temp directory...") + shutil.rmtree(work_dir) + else: + logger.debug(f"[{volume_id}] Keeping temp directory: {work_dir}") + + logger.info(f"[{volume_id}] āœ“ SUCCESS - Completed in {processing_time:.1f}s") + + return VolumeResult( + volume_id=volume_id, + status='SUCCESS', + output_zip_path=zip_path, + processing_time=processing_time, + validation_report=validation_report.to_dict() + ) + + except Exception as e: + processing_time = time.time() - start_time + logger.exception(f"[{volume_id}] āœ— FAILED at stage '{current_stage}'") + + return VolumeResult( + volume_id=volume_id, + status='FAILED', + failed_stage=current_stage, + error_message=str(e), + processing_time=processing_time + ) + + +def main_pipeline(config: PipelineConfig) -> ProcessingResults: + """ + Execute the complete HathiTrust package creation pipeline. 
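+
+    Discovers volumes in config.input_dir, optionally narrows the batch
+    to a single volume or skips volumes that already have valid ZIPs
+    (resume mode), processes each volume via process_volume(), and then
+    writes CSV/JSON reports to config.logs_dir.
+
+    Minimal programmatic sketch (directory paths are illustrative
+    defaults, not requirements):
+
+        config = PipelineConfig(
+            input_dir=Path('input'), output_dir=Path('output'),
+            temp_dir=Path('temp'), logs_dir=Path('logs'),
+            config_path=Path('config.yaml'))
+        results = main_pipeline(config)
+        print(f"{results.success_rate:.1f}% of volumes succeeded")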
+ + Args: + config: Pipeline configuration + + Returns: + ProcessingResults with batch outcomes + """ + pipeline_start = time.time() + + logger.info("="*80) + logger.info("HathiTrust Package Automation Pipeline") + logger.info("="*80) + logger.info(f"Input directory: {config.input_dir}") + logger.info(f"Output directory: {config.output_dir}") + logger.info(f"Resume mode: {config.resume_mode}") + logger.info(f"Keep temp files: {config.keep_temp}") + logger.info("") + + # Step 1: Discover volumes + logger.info("Step 1: Discovering volumes...") + + try: + volume_groups = discover_volumes(config.input_dir) + except Exception as e: + logger.error(f"Failed to discover volumes: {e}") + return ProcessingResults() + + if not volume_groups: + logger.warning("No volumes found in input directory") + return ProcessingResults() + + logger.info(f"Found {len(volume_groups)} volume(s)") + + # Filter to single volume if specified + if config.volume_id: + if config.volume_id in volume_groups: + volume_groups = {config.volume_id: volume_groups[config.volume_id]} + logger.info(f"Processing single volume: {config.volume_id}") + else: + logger.error(f"Volume not found: {config.volume_id}") + return ProcessingResults() + + # Step 2: Filter already-processed volumes if in resume mode + volumes_to_process = {} + + if config.resume_mode: + logger.info("Checking for existing valid packages...") + for volume_id, volume_group in volume_groups.items(): + existing = check_existing_package(volume_id, config.output_dir) + if existing: + logger.info(f" ↷ Skipping {volume_id} (valid package exists)") + else: + volumes_to_process[volume_id] = volume_group + + logger.info(f"Resume mode: {len(volumes_to_process)} volume(s) to process " + f"({len(volume_groups) - len(volumes_to_process)} skipped)") + else: + volumes_to_process = volume_groups + + if not volumes_to_process: + logger.info("All volumes already processed!") + return ProcessingResults() + + # Step 3: Process each volume + logger.info("") + logger.info(f"Step 2: Processing {len(volumes_to_process)} volume(s)...") + logger.info("-"*80) + + results = ProcessingResults() + + # Process with progress bar + with tqdm(volumes_to_process.items(), + desc="Overall Progress", + unit="volume", + disable=config.verbose) as pbar: + + for volume_id, volume_group in pbar: + pbar.set_description(f"Processing {volume_id}") + + result = process_volume(volume_id, volume_group, config) + + if result.status == 'SUCCESS': + results.successful_volumes.append(result) + else: + results.failed_volumes.append(result) + + # Calculate total time + results.total_processing_time = time.time() - pipeline_start + + # Step 4: Generate reports + logger.info("") + logger.info("-"*80) + logger.info("Step 3: Generating reports...") + + try: + results.report_path = generate_reports(results, config) + logger.info(f"Reports generated: {results.report_path.parent}") + except Exception as e: + logger.error(f"Failed to generate reports: {e}") + + return results + + +def generate_reports(results: ProcessingResults, config: PipelineConfig) -> Path: + """ + Generate CSV and JSON processing reports. 
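+
+    Both files are written to config.logs_dir with a shared timestamp in
+    their filenames. The CSV holds one row per volume with the detailed
+    validation report omitted; the JSON adds a batch summary block and
+    the full per-volume validation reports.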
+ + Args: + results: Processing results + config: Pipeline configuration + + Returns: + Path to CSV report file + """ + # Create logs directory if it doesn't exist + config.logs_dir.mkdir(parents=True, exist_ok=True) + + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + + # Generate CSV report + csv_path = config.logs_dir / f'processing_report_{timestamp}.csv' + + with open(csv_path, 'w', newline='') as f: + fieldnames = [ + 'volume_id', 'status', 'failed_stage', 'error_message', + 'output_path', 'processing_time_seconds' + ] + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + + for result in results.successful_volumes + results.failed_volumes: + row = result.to_dict() + # Remove validation_report from CSV (too detailed) + row.pop('validation_report', None) + writer.writerow(row) + + logger.info(f" CSV report: {csv_path}") + + # Generate JSON report (detailed) + json_path = config.logs_dir / f'processing_report_{timestamp}.json' + + report_data = { + 'summary': { + 'timestamp': timestamp, + 'total_volumes': results.total_volumes, + 'successful': len(results.successful_volumes), + 'failed': len(results.failed_volumes), + 'success_rate': round(results.success_rate, 2), + 'total_processing_time_seconds': round(results.total_processing_time, 2) + }, + 'volumes': [v.to_dict() for v in results.successful_volumes + results.failed_volumes] + } + + with open(json_path, 'w') as f: + json.dump(report_data, f, indent=2) + + logger.info(f" JSON report: {json_path}") + + return csv_path + + +def print_summary(results: ProcessingResults) -> None: + """Print processing summary to console.""" + print("\n" + "="*80) + print("PROCESSING SUMMARY") + print("="*80) + print(f"Total volumes: {results.total_volumes}") + print(f"Successful: {len(results.successful_volumes)} ({results.success_rate:.1f}%)") + print(f"Failed: {len(results.failed_volumes)}") + print(f"Processing time: {results.total_processing_time:.1f}s") + + if results.successful_volumes: + print("\nāœ“ SUCCESSFUL VOLUMES:") + for result in results.successful_volumes: + print(f" • {result.volume_id} ({result.processing_time:.1f}s)") + if result.output_zip_path: + print(f" → {result.output_zip_path}") + + if results.failed_volumes: + print("\nāœ— FAILED VOLUMES:") + for result in results.failed_volumes: + print(f" • {result.volume_id} - Failed at: {result.failed_stage}") + print(f" Error: {result.error_message}") + + if results.report_path: + print(f"\nšŸ“„ Reports: {results.report_path.parent}") + + print("="*80 + "\n") + + +def main(): + """Main entry point with CLI argument parsing.""" + parser = argparse.ArgumentParser( + description='HathiTrust Package Automation Pipeline', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Process all volumes in input/ + python main_pipeline.py + + # Process a single volume + python main_pipeline.py --volume-id 39015012345678 + + # Resume interrupted batch (skip existing valid ZIPs) + python main_pipeline.py --resume + + # Custom directories + python main_pipeline.py --input-dir /path/to/tiffs --output-dir /path/to/zips + + # Debug mode (keep temp files, verbose output) + python main_pipeline.py --keep-temp --verbose + """ + ) + + parser.add_argument( + '--input-dir', + type=str, + help='Input directory containing TIFF files (default: from config.yaml)' + ) + + parser.add_argument( + '--output-dir', + type=str, + help='Output directory for ZIP packages (default: from config.yaml)' + ) + + parser.add_argument( + '--config', + type=str, + 
default='config.yaml', + help='Path to configuration file (default: config.yaml)' + ) + + parser.add_argument( + '--volume-id', + type=str, + help='Process only this volume identifier' + ) + + parser.add_argument( + '--resume', + action='store_true', + help='Skip volumes with existing valid ZIP packages' + ) + + parser.add_argument( + '--keep-temp', + action='store_true', + help='Keep temporary working directories (for debugging)' + ) + + parser.add_argument( + '--dry-run', + action='store_true', + help='Validate configuration without processing' + ) + + parser.add_argument( + '--verbose', + action='store_true', + help='Enable detailed console output' + ) + + args = parser.parse_args() + + # Load configuration + try: + config = load_configuration(args) + except Exception as e: + print(f"ERROR: Failed to load configuration: {e}") + return 1 + + # Setup logging + setup_logging(config) + + # Dry-run mode + if args.dry_run: + print("Dry-run mode: Configuration validated successfully") + print(f" Input: {config.input_dir}") + print(f" Output: {config.output_dir}") + print(f" Temp: {config.temp_dir}") + print(f" Logs: {config.logs_dir}") + return 0 + + # Verify input directory exists + if not config.input_dir.exists(): + logger.error(f"Input directory not found: {config.input_dir}") + return 1 + + # Execute pipeline + try: + results = main_pipeline(config) + + # Print summary + print_summary(results) + + # Return exit code based on results + if results.failed_volumes: + return 1 + return 0 + + except KeyboardInterrupt: + logger.warning("\nProcessing interrupted by user") + return 130 + except Exception as e: + logger.exception("Pipeline execution failed") + return 1 + + +if __name__ == '__main__': + exit(main()) diff --git a/ocr_processor.py b/src/ocr_processor.py similarity index 99% rename from ocr_processor.py rename to src/ocr_processor.py index b2d7a2e..67a9c00 100755 --- a/ocr_processor.py +++ b/src/ocr_processor.py @@ -245,7 +245,7 @@ def process_volume(self, tiff_files: List[Path], output_dir: Path) -> Dict[str, # Demo/Testing functionality if __name__ == "__main__": import argparse - from volume_discovery import discover_volumes + from src.volume_discovery import discover_volumes logging.basicConfig( level=logging.INFO, diff --git a/src/package_assembler.py b/src/package_assembler.py new file mode 100644 index 0000000..d1f534a --- /dev/null +++ b/src/package_assembler.py @@ -0,0 +1,387 @@ +#!/usr/bin/env python3 +""" +Step 7: Package Assembly +Assembles HathiTrust submission packages from processed files. + +This module organizes TIFF images, OCR outputs, and metadata into a flat +directory structure compliant with HathiTrust SIP requirements. +""" + +import logging +import shutil +from pathlib import Path +from typing import List, Optional, Dict +from dataclasses import dataclass + +# Import from previous steps +from src.checksum_generator import ChecksumGenerator +from src.file_validator import FileValidator + + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + +@dataclass +class PackageValidationResult: + """Result of package structure validation""" + is_valid: bool + package_dir: Path + errors: List[str] + warnings: List[str] + files_copied: List[Path] + total_files: int + + +class PackageAssembler: + """Assembles HathiTrust submission packages from processed files""" + + def __init__(self, output_base_dir: Path): + """ + Initialize PackageAssembler. 
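+
+        The base directory is created if it does not already exist.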
+ + Args: + output_base_dir: Base directory where packages will be created + """ + self.output_base_dir = Path(output_base_dir) + self.output_base_dir.mkdir(parents=True, exist_ok=True) + logger.info(f"PackageAssembler initialized with output: {self.output_base_dir}") + + def assemble_package( + self, + volume_id: str, + tiff_files: List[Path], + text_files: List[Path], + hocr_files: List[Path], + meta_yml: Path, + checksum_md5: Optional[Path] = None, + generate_checksum: bool = True + ) -> Path: + """ + Assemble a complete HathiTrust submission package. + + Creates a flat directory structure containing: + - TIFF images (00000001.tif, 00000002.tif, ...) + - Plain text OCR (00000001.txt, 00000002.txt, ...) + - Coordinate OCR (00000001.html, 00000002.html, ...) + - meta.yml metadata file + - checksum.md5 fixity file + + Args: + volume_id: Volume identifier (barcode or ARK) + tiff_files: List of TIFF image files + text_files: List of plain text OCR files + hocr_files: List of hOCR coordinate files + meta_yml: Path to meta.yml metadata file + checksum_md5: Optional pre-existing checksum file + generate_checksum: Generate checksum.md5 if True + + Returns: + Path to assembled package directory + + Raises: + ValueError: If validation fails + """ + logger.info(f"Assembling package for volume: {volume_id}") + + # Create package directory + package_dir = self.output_base_dir / volume_id + package_dir.mkdir(parents=True, exist_ok=True) + logger.info(f"Created package directory: {package_dir}") + + # Copy TIFF files + logger.info(f"Copying {len(tiff_files)} TIFF files...") + self.copy_files_to_package(tiff_files, package_dir) + + # Copy text OCR files + logger.info(f"Copying {len(text_files)} text OCR files...") + self.copy_files_to_package(text_files, package_dir) + + # Copy hOCR files + logger.info(f"Copying {len(hocr_files)} hOCR files...") + self.copy_files_to_package(hocr_files, package_dir) + + # Copy meta.yml + if not meta_yml.exists(): + raise ValueError(f"meta.yml not found: {meta_yml}") + logger.info(f"Copying meta.yml...") + shutil.copy2(meta_yml, package_dir / "meta.yml") + + # Handle checksum.md5 + if checksum_md5 and checksum_md5.exists(): + logger.info("Copying existing checksum.md5...") + shutil.copy2(checksum_md5, package_dir / "checksum.md5") + elif generate_checksum: + logger.info("Generating checksum.md5...") + checksum_gen = ChecksumGenerator() + checksum_gen.generate_checksums(package_dir) + + # Validate package structure + logger.info("Validating package structure...") + validation = self.validate_package_structure(package_dir) + + if not validation.is_valid: + error_msg = f"Package validation failed:\n" + "\n".join(validation.errors) + logger.error(error_msg) + raise ValueError(error_msg) + + if validation.warnings: + for warning in validation.warnings: + logger.warning(warning) + + logger.info(f"āœ“ Successfully assembled package: {package_dir}") + logger.info(f" Total files: {validation.total_files}") + return package_dir + + def copy_files_to_package( + self, + source_files: List[Path], + package_dir: Path + ) -> List[Path]: + """ + Copy files to package directory. 
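+
+        Filenames carrying a volume-identifier prefix are renamed to the bare
+        8-digit sequence on copy (e.g. "39015012345678_00000001.tif" becomes
+        "00000001.tif"); missing source files are skipped with a warning.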
+ + Args: + source_files: List of source file paths + package_dir: Destination package directory + + Returns: + List of destination file paths + """ + copied_files = [] + + for source_file in source_files: + if not source_file.exists(): + logger.warning(f"Source file not found, skipping: {source_file}") + continue + + # Extract 8-digit sequence from filename if it has volume identifier prefix + # Pattern: _00000001.tif -> 00000001.tif + filename = source_file.name + if '_' in filename: + parts = filename.rsplit('_', 1) + if len(parts) == 2 and parts[1][:8].isdigit(): + # Has volume identifier prefix, use only the sequence part + filename = parts[1] + + dest_file = package_dir / filename + shutil.copy2(source_file, dest_file) + copied_files.append(dest_file) + logger.debug(f"Copied: {source_file.name} -> {filename}") + + return copied_files + + def validate_package_structure( + self, + package_dir: Path + ) -> PackageValidationResult: + """ + Validate package meets HathiTrust requirements. + + Checks: + - Flat structure (no subdirectories) + - Required files present (meta.yml, checksum.md5) + - Matching triplets (TIFF/TXT/HTML) + - Sequential numbering with no gaps + + Args: + package_dir: Package directory to validate + + Returns: + PackageValidationResult with validation status + """ + errors = [] + warnings = [] + + if not package_dir.exists(): + errors.append(f"Package directory does not exist: {package_dir}") + return PackageValidationResult( + is_valid=False, + package_dir=package_dir, + errors=errors, + warnings=warnings, + files_copied=[], + total_files=0 + ) + + # Get all files in package + all_files = list(package_dir.iterdir()) + + # Check 1: No subdirectories + subdirs = [f for f in all_files if f.is_dir()] + if subdirs: + errors.append(f"Subdirectories found (not allowed): {[d.name for d in subdirs]}") + + # Get only files (no directories) + files = [f for f in all_files if f.is_file()] + + # Check 2: Required files present + file_names = {f.name for f in files} + if "meta.yml" not in file_names: + errors.append("Required file missing: meta.yml") + if "checksum.md5" not in file_names: + warnings.append("checksum.md5 not found (will be generated later)") + + # Check 3: Extract file types + tiff_files = sorted([f for f in files if f.suffix.lower() in ['.tif', '.tiff']]) + txt_files = sorted([f for f in files if f.suffix.lower() == '.txt']) + html_files = sorted([f for f in files if f.suffix.lower() == '.html']) + + # Check 4: Matching triplets (TIFF/TXT/HTML) + if tiff_files: + tiff_basenames = {f.stem for f in tiff_files} + txt_basenames = {f.stem for f in txt_files} if txt_files else set() + html_basenames = {f.stem for f in html_files} if html_files else set() + + if tiff_basenames != txt_basenames: + missing_txt = tiff_basenames - txt_basenames + extra_txt = txt_basenames - tiff_basenames + if missing_txt: + errors.append(f"TIFFs missing corresponding TXT files: {missing_txt}") + if extra_txt: + warnings.append(f"Extra TXT files without TIFFs: {extra_txt}") + + if tiff_basenames != html_basenames: + missing_html = tiff_basenames - html_basenames + extra_html = html_basenames - tiff_basenames + if missing_html: + errors.append(f"TIFFs missing corresponding HTML files: {missing_html}") + if extra_html: + warnings.append(f"Extra HTML files without TIFFs: {extra_html}") + + # Check 5: Sequential numbering (use FileValidator) + if tiff_files: + validator = FileValidator(str(package_dir)) + + # Extract sequence numbers + sequence_numbers = [] + for tiff_file in tiff_files: + try: + 
seq_num = int(tiff_file.stem) + sequence_numbers.append(seq_num) + except ValueError: + errors.append(f"Invalid filename (not 8-digit number): {tiff_file.name}") + + # Check for gaps in sequence + if sequence_numbers: + sequence_numbers.sort() + expected = list(range(1, len(sequence_numbers) + 1)) + if sequence_numbers != expected: + errors.append(f"Non-sequential numbering detected") + missing = set(expected) - set(sequence_numbers) + if missing: + errors.append(f"Missing sequence numbers: {sorted(missing)}") + + # Return validation result + is_valid = len(errors) == 0 + + return PackageValidationResult( + is_valid=is_valid, + package_dir=package_dir, + errors=errors, + warnings=warnings, + files_copied=files, + total_files=len(files) + ) + + + +def main(): + """Command-line interface for package assembly""" + import argparse + + parser = argparse.ArgumentParser( + description="Assemble HathiTrust submission packages from processed files" + ) + parser.add_argument( + "volume_id", + help="Volume identifier (barcode or ARK)" + ) + parser.add_argument( + "--tiff-dir", + type=Path, + required=True, + help="Directory containing TIFF files" + ) + parser.add_argument( + "--ocr-dir", + type=Path, + required=True, + help="Directory containing OCR output (TXT and HTML files)" + ) + parser.add_argument( + "--meta-yml", + type=Path, + required=True, + help="Path to meta.yml metadata file" + ) + parser.add_argument( + "--output-dir", + type=Path, + default=Path("output"), + help="Output directory for assembled packages (default: output/)" + ) + parser.add_argument( + "--generate-checksum", + action="store_true", + help="Generate checksum.md5 file" + ) + parser.add_argument( + "--checksum-file", + type=Path, + help="Path to existing checksum.md5 file" + ) + + args = parser.parse_args() + + # Validate input directories + if not args.tiff_dir.exists(): + logger.error(f"TIFF directory not found: {args.tiff_dir}") + return 1 + + if not args.ocr_dir.exists(): + logger.error(f"OCR directory not found: {args.ocr_dir}") + return 1 + + if not args.meta_yml.exists(): + logger.error(f"meta.yml not found: {args.meta_yml}") + return 1 + + # Gather files + tiff_files = sorted(args.tiff_dir.glob("*.tif")) + sorted(args.tiff_dir.glob("*.tiff")) + txt_files = sorted(args.ocr_dir.glob("*.txt")) + html_files = sorted(args.ocr_dir.glob("*.html")) + + logger.info(f"Found {len(tiff_files)} TIFF files") + logger.info(f"Found {len(txt_files)} TXT files") + logger.info(f"Found {len(html_files)} HTML files") + + # Create assembler and assemble package + assembler = PackageAssembler(args.output_dir) + + try: + package_dir = assembler.assemble_package( + volume_id=args.volume_id, + tiff_files=tiff_files, + text_files=txt_files, + hocr_files=html_files, + meta_yml=args.meta_yml, + checksum_md5=args.checksum_file, + generate_checksum=args.generate_checksum + ) + + logger.info(f"āœ“ Package assembled successfully: {package_dir}") + return 0 + + except Exception as e: + logger.error(f"āœ— Failed to assemble package: {e}") + return 1 + + +if __name__ == "__main__": + import sys + sys.exit(main()) diff --git a/src/package_validator.py b/src/package_validator.py new file mode 100644 index 0000000..f279512 --- /dev/null +++ b/src/package_validator.py @@ -0,0 +1,584 @@ +#!/usr/bin/env python3 +""" +Step 9: Quality Control & Validation +Comprehensive HathiTrust package validation and compliance checking. 
+ +This module performs thorough validation of HathiTrust submission packages +to ensure they meet all technical requirements before submission. +""" + +import logging +import re +import zipfile +import yaml +from pathlib import Path +from typing import List, Dict, Optional, Set +from dataclasses import dataclass, field + +from src.zip_packager import ZIPPackager +from src.checksum_generator import ChecksumGenerator + + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + +@dataclass +class ValidationReport: + """Comprehensive validation report for HathiTrust package""" + package_path: Path + is_valid: bool + + # Overall status + total_checks: int = 0 + passed_checks: int = 0 + failed_checks: int = 0 + + # Check categories + naming_checks: List[str] = field(default_factory=list) + structure_checks: List[str] = field(default_factory=list) + content_checks: List[str] = field(default_factory=list) + metadata_checks: List[str] = field(default_factory=list) + integrity_checks: List[str] = field(default_factory=list) + + # Issues found + errors: List[str] = field(default_factory=list) + warnings: List[str] = field(default_factory=list) + + # Package details + volume_id: Optional[str] = None + file_count: int = 0 + tiff_count: int = 0 + has_required_files: bool = False + has_valid_triplets: bool = False + has_valid_yaml: bool = False + has_valid_checksums: bool = False + + def add_pass(self, category: str, message: str): + """Record a passed validation check""" + self.total_checks += 1 + self.passed_checks += 1 + getattr(self, f"{category}_checks").append(f"āœ“ {message}") + + def add_fail(self, category: str, message: str, is_warning: bool = False): + """Record a failed validation check""" + self.total_checks += 1 + if is_warning: + self.warnings.append(message) + getattr(self, f"{category}_checks").append(f"⚠ {message}") + else: + self.failed_checks += 1 + self.errors.append(message) + getattr(self, f"{category}_checks").append(f"āœ— {message}") + + def get_summary(self) -> str: + """Generate human-readable validation summary""" + status = "āœ“ VALID" if self.is_valid else "āœ— INVALID" + return f""" +{'='*70} +HathiTrust Package Validation Report +{'='*70} +Package: {self.package_path.name} +Status: {status} + +Summary: + Total Checks: {self.total_checks} + Passed: {self.passed_checks} + Failed: {self.failed_checks} + Warnings: {len(self.warnings)} + +Package Details: + Volume ID: {self.volume_id or 'Unknown'} + Total Files: {self.file_count} + TIFF Images: {self.tiff_count} + Required Files: {'āœ“' if self.has_required_files else 'āœ—'} + Valid Triplets: {'āœ“' if self.has_valid_triplets else 'āœ—'} + Valid YAML: {'āœ“' if self.has_valid_yaml else 'āœ—'} + Valid Checksums: {'āœ“' if self.has_valid_checksums else 'āœ—'} + +{'='*70} +""" + + def to_dict(self) -> Dict: + """Convert ValidationReport to dictionary for JSON serialization.""" + return { + 'package_path': str(self.package_path), + 'is_valid': self.is_valid, + 'total_checks': self.total_checks, + 'passed_checks': self.passed_checks, + 'failed_checks': self.failed_checks, + 'errors': self.errors, + 'warnings': self.warnings, + 'volume_id': self.volume_id, + 'file_count': self.file_count, + 'tiff_count': self.tiff_count, + 'has_required_files': self.has_required_files, + 'has_valid_triplets': self.has_valid_triplets, + 'has_valid_yaml': self.has_valid_yaml, + 'has_valid_checksums': self.has_valid_checksums + } + + 
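+# Minimal usage sketch (the ZIP path below is illustrative only):
+#
+#     report = PackageValidator().validate_package(Path("output/39015012345678.zip"))
+#     print(report.get_summary())
+#     if not report.is_valid:
+#         print("\n".join(report.errors))
+#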
+class PackageValidator: + """Comprehensive HathiTrust package validation""" + + # HathiTrust identifier patterns + BARCODE_PATTERN = re.compile(r'^\d+$') + ARK_PATTERN = re.compile(r'^ark[_:/].+') + + # Required files + REQUIRED_FILES = {'meta.yml', 'checksum.md5'} + + # File naming pattern (8-digit sequence) + SEQUENCE_PATTERN = re.compile(r'^(\d{8})\.(tif|txt|html)$', re.IGNORECASE) + + def __init__(self): + """Initialize package validator""" + self.zip_packager = ZIPPackager(Path('.')) + self.checksum_generator = ChecksumGenerator() + + def validate_package(self, zip_path: Path) -> ValidationReport: + """ + Perform comprehensive validation of HathiTrust package. + + Args: + zip_path: Path to ZIP file to validate + + Returns: + ValidationReport with detailed validation results + """ + zip_path = Path(zip_path) + report = ValidationReport(package_path=zip_path, is_valid=False) + + logger.info(f"Validating package: {zip_path.name}") + + # Check 1: ZIP file exists + if not zip_path.exists(): + report.add_fail('structure', f"ZIP file not found: {zip_path}", is_warning=False) + report.is_valid = False + return report + + report.add_pass('structure', f"ZIP file exists: {zip_path.name}") + + try: + with zipfile.ZipFile(zip_path, 'r') as zf: + # Get ZIP contents + zip_contents = zf.namelist() + report.file_count = len(zip_contents) + + # Check 2: Naming convention + self._validate_naming(zip_path, report) + + # Check 3: ZIP structure (flat, no subdirectories) + self._validate_structure(zip_contents, report) + + # Check 4: Required files present + self._validate_required_files(zip_contents, report) + + # Check 5: File triplets (TIFF/TXT/HTML matching) + self._validate_triplets(zip_contents, report) + + # Check 6: Sequential numbering + self._validate_sequential_numbering(zip_contents, report) + + # Check 7: YAML metadata + self._validate_yaml_metadata(zf, report) + + # Check 8: MD5 checksums + self._validate_checksums(zf, zip_contents, report) + + except zipfile.BadZipFile: + report.add_fail('structure', "Invalid ZIP file format", is_warning=False) + report.is_valid = False + return report + except Exception as e: + report.add_fail('structure', f"Error reading ZIP: {str(e)}", is_warning=False) + report.is_valid = False + return report + + # Final determination + report.is_valid = (report.failed_checks == 0) + + logger.info(f"Validation complete: {'VALID' if report.is_valid else 'INVALID'}") + logger.info(f"Passed: {report.passed_checks}/{report.total_checks}") + + return report + + def _validate_naming(self, zip_path: Path, report: ValidationReport): + """Validate ZIP filename matches HathiTrust identifier conventions""" + filename = zip_path.stem # Remove .zip extension + + # Extract volume ID + report.volume_id = filename + + # Check if matches barcode or ARK pattern + if self.BARCODE_PATTERN.match(filename): + report.add_pass('naming', f"Valid barcode identifier: {filename}") + elif self.ARK_PATTERN.match(filename): + report.add_pass('naming', f"Valid ARK identifier: {filename}") + else: + report.add_fail('naming', + f"ZIP filename doesn't match barcode or ARK pattern: {filename}", + is_warning=True) + + def _validate_structure(self, zip_contents: List[str], report: ValidationReport): + """Validate flat structure with no subdirectories""" + subdirs_found = [] + + for name in zip_contents: + if '/' in name or '\\' in name: + subdirs_found.append(name) + + if subdirs_found: + report.add_fail('structure', + f"Found {len(subdirs_found)} files in subdirectories (must be flat structure)", + 
is_warning=False) + # Show first few examples + for subdir in subdirs_found[:3]: + report.add_fail('structure', f" Example: {subdir}", is_warning=False) + else: + report.add_pass('structure', "ZIP has flat structure (no subdirectories)") + + def _validate_required_files(self, zip_contents: List[str], report: ValidationReport): + """Validate required files (meta.yml, checksum.md5) are present""" + zip_set = set(zip_contents) + missing_files = self.REQUIRED_FILES - zip_set + + if missing_files: + for missing in missing_files: + report.add_fail('content', f"Required file missing: {missing}", is_warning=False) + report.has_required_files = False + else: + report.add_pass('content', f"All required files present: {', '.join(self.REQUIRED_FILES)}") + report.has_required_files = True + + def _validate_triplets(self, zip_contents: List[str], report: ValidationReport): + """Validate matching TIFF/TXT/HTML triplets""" + # Extract base names by extension + tiff_bases = set() + txt_bases = set() + html_bases = set() + + for filename in zip_contents: + match = self.SEQUENCE_PATTERN.match(filename) + if match: + seq_num = match.group(1) + ext = match.group(2).lower() + + if ext == 'tif': + tiff_bases.add(seq_num) + elif ext == 'txt': + txt_bases.add(seq_num) + elif ext == 'html': + html_bases.add(seq_num) + + report.tiff_count = len(tiff_bases) + + # Check for missing companions + missing_txt = tiff_bases - txt_bases + missing_html = tiff_bases - html_bases + extra_txt = txt_bases - tiff_bases + extra_html = html_bases - tiff_bases + + if missing_txt: + report.add_fail('content', + f"Found {len(missing_txt)} TIFF files without matching TXT files", + is_warning=False) + # Show examples + for seq in sorted(missing_txt)[:3]: + report.add_fail('content', f" Missing: {seq}.txt", is_warning=False) + + if missing_html: + report.add_fail('content', + f"Found {len(missing_html)} TIFF files without matching HTML files", + is_warning=False) + # Show examples + for seq in sorted(missing_html)[:3]: + report.add_fail('content', f" Missing: {seq}.html", is_warning=False) + + if extra_txt: + report.add_fail('content', + f"Found {len(extra_txt)} TXT files without matching TIFF files", + is_warning=True) + + if extra_html: + report.add_fail('content', + f"Found {len(extra_html)} HTML files without matching TIFF files", + is_warning=True) + + if not (missing_txt or missing_html or extra_txt or extra_html): + report.add_pass('content', f"All {len(tiff_bases)} TIFF files have matching TXT and HTML files") + report.has_valid_triplets = True + else: + report.has_valid_triplets = False + + def _validate_sequential_numbering(self, zip_contents: List[str], report: ValidationReport): + """Validate files use sequential 8-digit numbering with no gaps""" + # Extract all sequence numbers + sequences = set() + + for filename in zip_contents: + match = self.SEQUENCE_PATTERN.match(filename) + if match: + sequences.add(int(match.group(1))) + + if not sequences: + report.add_fail('content', "No sequentially-numbered files found", is_warning=False) + return + + # Check for gaps in sequence + min_seq = min(sequences) + max_seq = max(sequences) + expected_range = set(range(min_seq, max_seq + 1)) + + missing_seqs = expected_range - sequences + + if missing_seqs: + report.add_fail('content', + f"Found {len(missing_seqs)} gaps in sequential numbering", + is_warning=False) + # Show examples + for seq in sorted(missing_seqs)[:5]: + report.add_fail('content', f" Missing sequence: {seq:08d}", is_warning=False) + else: + report.add_pass('content', + 
f"Sequential numbering valid: {min_seq:08d} to {max_seq:08d} ({len(sequences)} sequences)") + + # Check if starts at 00000001 + if min_seq != 1: + report.add_fail('content', + f"Sequence doesn't start at 00000001 (starts at {min_seq:08d})", + is_warning=True) + + def _validate_yaml_metadata(self, zf: zipfile.ZipFile, report: ValidationReport): + """Validate meta.yml structure and required fields""" + try: + yaml_content = zf.read('meta.yml').decode('utf-8') + + # Parse YAML + try: + metadata = yaml.safe_load(yaml_content) + except yaml.YAMLError as e: + report.add_fail('metadata', f"YAML parsing error: {str(e)}", is_warning=False) + report.has_valid_yaml = False + return + + # Check required fields + required_fields = { + 'capture_date': 'Capture date', + 'scanner_user': 'Scanner operator', + 'pagedata': 'Page data section' + } + + missing_fields = [] + for field, description in required_fields.items(): + if field not in metadata: + missing_fields.append(description) + + if missing_fields: + for field in missing_fields: + report.add_fail('metadata', f"Missing required YAML field: {field}", is_warning=False) + report.has_valid_yaml = False + else: + report.add_pass('metadata', "YAML structure valid with all required fields") + + # Validate pagedata structure + if isinstance(metadata.get('pagedata'), dict): + page_count = len(metadata['pagedata']) + report.add_pass('metadata', f"Page data contains {page_count} pages") + report.has_valid_yaml = True + else: + report.add_fail('metadata', "pagedata field is not a dictionary", is_warning=False) + report.has_valid_yaml = False + + except KeyError: + # meta.yml not found - already caught in required files check + report.has_valid_yaml = False + except Exception as e: + report.add_fail('metadata', f"Error reading YAML: {str(e)}", is_warning=False) + report.has_valid_yaml = False + + def _validate_checksums(self, zf: zipfile.ZipFile, zip_contents: List[str], report: ValidationReport): + """Validate MD5 checksums match file contents""" + try: + checksum_content = zf.read('checksum.md5').decode('utf-8') + + # Parse checksum file + checksums = {} + for line in checksum_content.strip().split('\n'): + if not line.strip(): + continue + + parts = line.split(None, 1) # Split on whitespace, max 2 parts + if len(parts) == 2: + expected_hash, filename = parts + checksums[filename] = expected_hash + + if not checksums: + report.add_fail('integrity', "checksum.md5 file is empty", is_warning=False) + report.has_valid_checksums = False + return + + report.add_pass('integrity', f"Checksum file contains {len(checksums)} entries") + + # Verify each file in checksums exists + zip_set = set(zip_contents) + missing_files = set(checksums.keys()) - zip_set + + if missing_files: + report.add_fail('integrity', + f"Found {len(missing_files)} files in checksum.md5 but not in ZIP", + is_warning=False) + for missing in sorted(missing_files)[:3]: + report.add_fail('integrity', f" Missing: {missing}", is_warning=False) + + # Compute actual checksums and compare + mismatches = 0 + checked = 0 + + for filename, expected_hash in checksums.items(): + if filename == 'checksum.md5': + continue # Don't validate checksum of checksum file + + if filename not in zip_set: + continue # Already reported as missing + + try: + file_data = zf.read(filename) + actual_hash = self.checksum_generator.compute_md5_from_bytes(file_data) + + if actual_hash != expected_hash: + mismatches += 1 + if mismatches <= 3: # Show first 3 mismatches + report.add_fail('integrity', + f"Checksum mismatch for 
{filename}", + is_warning=False) + else: + checked += 1 + + except Exception as e: + report.add_fail('integrity', + f"Error computing checksum for {filename}: {str(e)}", + is_warning=True) + + if mismatches > 0: + report.add_fail('integrity', + f"Found {mismatches} checksum mismatches", + is_warning=False) + report.has_valid_checksums = False + else: + report.add_pass('integrity', f"All {checked} checksums validated successfully") + report.has_valid_checksums = True + + except KeyError: + # checksum.md5 not found - already caught in required files check + report.has_valid_checksums = False + except Exception as e: + report.add_fail('integrity', f"Error reading checksums: {str(e)}", is_warning=False) + report.has_valid_checksums = False + + +def validate_hathitrust_package(zip_path: Path) -> ValidationReport: + """ + Convenience function to validate a HathiTrust package. + + Args: + zip_path: Path to ZIP file to validate + + Returns: + ValidationReport with comprehensive validation results + """ + validator = PackageValidator() + return validator.validate_package(zip_path) + + + +if __name__ == '__main__': + import argparse + import sys + + parser = argparse.ArgumentParser( + description='Validate HathiTrust submission packages for compliance' + ) + + parser.add_argument( + 'zip_file', + type=Path, + help='Path to ZIP file to validate' + ) + parser.add_argument( + '--verbose', + action='store_true', + help='Show detailed check results' + ) + parser.add_argument( + '--json', + action='store_true', + help='Output report in JSON format' + ) + + args = parser.parse_args() + + if not args.zip_file.exists(): + logger.error(f"ZIP file not found: {args.zip_file}") + sys.exit(1) + + # Run validation + validator = PackageValidator() + report = validator.validate_package(args.zip_file) + + # Output results + if args.json: + import json + output = { + 'package': str(report.package_path), + 'is_valid': report.is_valid, + 'total_checks': report.total_checks, + 'passed_checks': report.passed_checks, + 'failed_checks': report.failed_checks, + 'warnings': report.warnings, + 'errors': report.errors, + 'volume_id': report.volume_id, + 'file_count': report.file_count, + 'tiff_count': report.tiff_count + } + print(json.dumps(output, indent=2)) + else: + # Print summary + print(report.get_summary()) + + if args.verbose: + # Print detailed checks + categories = [ + ('Naming Convention', report.naming_checks), + ('ZIP Structure', report.structure_checks), + ('Content Validation', report.content_checks), + ('Metadata Validation', report.metadata_checks), + ('Integrity Checks', report.integrity_checks) + ] + + for category, checks in categories: + if checks: + print(f"\n{category}:") + print("-" * 70) + for check in checks: + print(f" {check}") + + if report.errors: + print(f"\nāŒ Errors ({len(report.errors)}):") + print("-" * 70) + for error in report.errors: + print(f" {error}") + + if report.warnings: + print(f"\nāš ļø Warnings ({len(report.warnings)}):") + print("-" * 70) + for warning in report.warnings: + print(f" {warning}") + + # Exit with appropriate code + sys.exit(0 if report.is_valid else 1) diff --git a/volume_discovery.py b/src/volume_discovery.py similarity index 98% rename from volume_discovery.py rename to src/volume_discovery.py index c7e503a..20861c7 100755 --- a/volume_discovery.py +++ b/src/volume_discovery.py @@ -13,7 +13,8 @@ # Regex patterns for file identification TIFF_PATTERN = re.compile(r'^.*?(\d{8})\.tif$', re.IGNORECASE) -BARCODE_PATTERN = re.compile(r'^(\d+)_\d{8}\.tif$', 
re.IGNORECASE) +# Updated to support alphanumeric identifiers with hyphens (e.g., mss19398-066) +BARCODE_PATTERN = re.compile(r'^([a-z0-9\-]+)_\d{8}\.tif$', re.IGNORECASE) ARK_PATTERN = re.compile(r'^ark[_-](\d+)[_-]([a-z0-9]+)_\d{8}\.tif$', re.IGNORECASE) diff --git a/src/yaml_generator.py b/src/yaml_generator.py new file mode 100755 index 0000000..4ac32e7 --- /dev/null +++ b/src/yaml_generator.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python3 +""" +YAML Metadata Generation +Creates meta.yml files for HathiTrust submission packages +""" + +import json +import yaml +import logging +from pathlib import Path +from typing import Dict, List, Optional +from datetime import datetime + + +class YAMLGenerator: + """Generates HathiTrust-compliant meta.yml metadata files""" + + # Valid page tags for HathiTrust + VALID_PAGE_TAGS = { + 'FRONT_COVER', 'BACK_COVER', 'TITLE', 'TITLE_PARTS', + 'TABLE_OF_CONTENTS', 'INDEX', 'BLANK', 'COPYRIGHT', + 'FIRST_CONTENT_CHAPTER_START', 'CHAPTER_START', 'CHAPTER_PAGE', + 'REFERENCES', 'MULTIWORK_BOUNDARY', 'IMAGE_ON_PAGE', + 'FOLDOUT' + } + + def __init__(self): + """Initialize YAML generator""" + pass + + @staticmethod + def load_metadata_from_json(json_path: Path) -> Dict: + """ + Load per-package metadata from JSON file + + Args: + json_path: Path to metadata JSON file + + Returns: + Dictionary containing metadata + """ + with open(json_path, 'r', encoding='utf-8') as f: + metadata = json.load(f) + + logging.info(f"Loaded metadata from {json_path.name}") + return metadata + + @staticmethod + def generate_pagedata(num_pages: int, reading_order: str = 'left-to-right') -> Dict: + """ + Generate pagedata section for meta.yml + + Args: + num_pages: Number of pages in the volume + reading_order: Reading order (left-to-right or right-to-left) + + Returns: + Dictionary with pagedata for each page + """ + pagedata = {} + + for i in range(1, num_pages + 1): + sequence_num = f"{i:08d}" + + # Basic pagedata entry + page_entry = { + 'orderlabel': sequence_num, + 'label': sequence_num # Default to sequence number + } + + # Special handling for common pages + if i == 1: + page_entry['label'] = 'FRONT_COVER' + elif i == num_pages: + page_entry['label'] = 'BACK_COVER' + + pagedata[sequence_num] = page_entry + + return pagedata + + def generate_meta_yml(self, metadata: Dict, num_pages: int, output_path: Path) -> Path: + """ + Generate complete meta.yml file for HathiTrust submission + + Args: + metadata: Package metadata dictionary (from collect_metadata.py) + num_pages: Number of pages in the volume + output_path: Where to save the meta.yml file + + Returns: + Path to generated meta.yml file + """ + logging.info(f"Generating meta.yml for {num_pages} pages") + + # Build meta.yml structure + meta = { + 'capture_date': metadata['capture_metadata']['capture_date'], + 'scanner_user': metadata['capture_metadata']['operator'], + 'scanner_make': 'Phase One', # CaptureOne manufacturer + 'scanner_model': metadata['capture_metadata']['software'], + 'scanning_order': metadata['page_order']['scanning_order'], + 'reading_order': metadata['page_order']['reading_order'], + } + + # Add image technical specifications + meta['image_compression_agent'] = metadata['capture_metadata']['software'] + meta['image_compression_date'] = metadata['capture_metadata']['capture_date'] + + # Add resolution info (optional but recommended) + if 'image_technical' in metadata: + meta['resolution_dpi'] = metadata['image_technical']['resolution_dpi'] + meta['bitdepth'] = metadata['image_technical']['bitdepth'] + + # 
Generate pagedata + pagedata = self.generate_pagedata( + num_pages, + metadata['page_order']['reading_order'] + ) + meta['pagedata'] = pagedata + + # Write YAML file + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'w', encoding='utf-8') as f: + yaml.dump(meta, f, default_flow_style=False, allow_unicode=True, sort_keys=False) + + logging.info(f"Generated meta.yml: {output_path}") + + # Validate the generated YAML + self.validate_yaml(output_path) + + return output_path + + @staticmethod + def validate_yaml(yaml_path: Path) -> bool: + """ + Validate that the generated YAML is well-formed + + Args: + yaml_path: Path to YAML file to validate + + Returns: + True if valid, raises exception if invalid + """ + try: + with open(yaml_path, 'r', encoding='utf-8') as f: + data = yaml.safe_load(f) + + # Check required fields + required_fields = ['capture_date', 'scanner_user', 'pagedata'] + for field in required_fields: + if field not in data: + raise ValueError(f"Missing required field: {field}") + + # Check pagedata structure + if not isinstance(data['pagedata'], dict): + raise ValueError("pagedata must be a dictionary") + + if len(data['pagedata']) == 0: + raise ValueError("pagedata cannot be empty") + + logging.info(f"āœ“ YAML validation passed: {yaml_path.name}") + return True + + except yaml.YAMLError as e: + logging.error(f"āœ— YAML parsing error: {e}") + raise + except Exception as e: + logging.error(f"āœ— Validation error: {e}") + raise + + def generate_from_volume(self, volume_id: str, metadata_json: Path, + tiff_files: List[Path], output_dir: Path) -> Path: + """ + Generate meta.yml for a complete volume + + Args: + volume_id: Volume identifier (barcode or ARK) + metadata_json: Path to metadata JSON file for this volume + tiff_files: List of TIFF files in the volume + output_dir: Directory to save meta.yml + + Returns: + Path to generated meta.yml file + """ + logging.info(f"Generating meta.yml for volume: {volume_id}") + + # Load metadata + metadata = self.load_metadata_from_json(metadata_json) + + # Determine number of pages + num_pages = len(tiff_files) + + # Generate meta.yml + output_path = output_dir / 'meta.yml' + return self.generate_meta_yml(metadata, num_pages, output_path) + + +# Demo/Testing functionality +if __name__ == "__main__": + import argparse + + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' + ) + + parser = argparse.ArgumentParser(description='Generate meta.yml for HathiTrust packages') + parser.add_argument('metadata_json', + help='Path to metadata JSON file') + parser.add_argument('--num-pages', type=int, + help='Number of pages (if not auto-detecting from directory)') + parser.add_argument('--output-dir', default='.', + help='Output directory for meta.yml (default: current directory)') + parser.add_argument('--tiff-dir', + help='Directory containing TIFF files (for auto page count)') + + args = parser.parse_args() + + try: + generator = YAMLGenerator() + metadata_path = Path(args.metadata_json) + + if not metadata_path.exists(): + logging.error(f"Metadata file not found: {metadata_path}") + exit(1) + + # Determine number of pages + if args.num_pages: + num_pages = args.num_pages + elif args.tiff_dir: + tiff_dir = Path(args.tiff_dir) + tiff_files = list(tiff_dir.glob("*.tif")) + list(tiff_dir.glob("*.TIF")) + num_pages = len(tiff_files) + logging.info(f"Auto-detected {num_pages} TIFF files") + else: + logging.error("Must provide either --num-pages or --tiff-dir") + exit(1) + + # Load 
metadata and generate YAML + metadata = generator.load_metadata_from_json(metadata_path) + output_dir = Path(args.output_dir) + output_path = output_dir / 'meta.yml' + + result = generator.generate_meta_yml(metadata, num_pages, output_path) + + print(f"\n{'='*60}") + print("meta.yml GENERATED SUCCESSFULLY") + print(f"{'='*60}") + print(f"Output: {result}") + print(f"Pages: {num_pages}") + print(f"\nValidation: āœ“ Passed") + + # Show preview + print(f"\n{'='*60}") + print("PREVIEW (first 20 lines)") + print(f"{'='*60}") + with open(result, 'r') as f: + lines = f.readlines()[:20] + print(''.join(lines)) + if len(lines) >= 20: + print("... (truncated)") + + except Exception as e: + logging.error(f"Error: {e}") + exit(1) diff --git a/src/zip_packager.py b/src/zip_packager.py new file mode 100644 index 0000000..03493ec --- /dev/null +++ b/src/zip_packager.py @@ -0,0 +1,485 @@ +#!/usr/bin/env python3 +""" +Step 8: ZIP Archive Creation +Creates HathiTrust-compliant ZIP archives from assembled packages. + +This module compresses assembled packages into properly-named ZIP files with +flat structure (no subdirectories) as required by HathiTrust specifications. +""" + +import logging +import zipfile +from pathlib import Path +from typing import List, Optional +from dataclasses import dataclass + + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + +@dataclass +class ZIPValidationResult: + """Result of ZIP structure validation""" + is_valid: bool + zip_path: Path + file_count: int + has_subdirectories: bool + missing_files: List[str] + extra_files: List[str] + errors: List[str] + warnings: List[str] + + +class ZIPPackager: + """Creates HathiTrust-compliant ZIP archives from assembled packages""" + + def __init__(self, output_dir: Path): + """ + Initialize ZIPPackager. + + Args: + output_dir: Directory where ZIP files will be created + """ + self.output_dir = Path(output_dir) + self.output_dir.mkdir(parents=True, exist_ok=True) + + def create_zip_archive(self, package_dir: Path, volume_id: str) -> Optional[Path]: + """ + Create ZIP archive from assembled package directory. + + Creates a flat-structure ZIP file where all files are at the root level + (no subdirectories), as required by HathiTrust specifications. 
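+
+        macOS metadata files (._* and .DS_Store) are skipped, and the finished
+        archive is integrity-checked; a corrupt result is deleted and None is
+        returned.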
+ + Args: + package_dir: Path to assembled package directory + volume_id: Volume identifier (used for ZIP filename) + + Returns: + Path to created ZIP file, or None if creation failed + + Raises: + FileNotFoundError: If package_dir doesn't exist + ValueError: If package_dir is empty + """ + package_dir = Path(package_dir) + + # Validate package directory exists + if not package_dir.exists(): + logger.error(f"Package directory not found: {package_dir}") + raise FileNotFoundError(f"Package directory not found: {package_dir}") + + if not package_dir.is_dir(): + logger.error(f"Path is not a directory: {package_dir}") + raise ValueError(f"Path is not a directory: {package_dir}") + + # Get list of files to archive + package_files = sorted([f for f in package_dir.iterdir() if f.is_file()]) + + if not package_files: + logger.error(f"Package directory is empty: {package_dir}") + raise ValueError(f"Package directory is empty: {package_dir}") + + # Create ZIP filename + zip_filename = f"{volume_id}.zip" + zip_path = self.output_dir / zip_filename + + logger.info(f"Creating ZIP archive: {zip_path}") + logger.info(f"Files to archive: {len(package_files)}") + + try: + # Create ZIP with compression + with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf: + for file_path in package_files: + # Use arcname to ensure flat structure (no directory paths) + arcname = file_path.name + + # Skip macOS metadata files + if arcname.startswith('._') or arcname == '.DS_Store': + logger.debug(f"Skipping macOS metadata: {arcname}") + continue + + logger.debug(f"Adding to ZIP: {arcname} ({file_path.stat().st_size} bytes)") + zf.write(file_path, arcname=arcname) + + # Verify ZIP integrity + if self._verify_zip_integrity(zip_path): + logger.info(f"āœ“ Successfully created ZIP: {zip_path}") + logger.info(f"āœ“ ZIP file size: {zip_path.stat().st_size:,} bytes") + return zip_path + else: + logger.error(f"ZIP integrity check failed: {zip_path}") + # Clean up corrupted ZIP + if zip_path.exists(): + zip_path.unlink() + return None + + except Exception as e: + logger.error(f"Failed to create ZIP archive: {e}") + # Clean up partial ZIP if it exists + if zip_path.exists(): + logger.debug(f"Cleaning up partial ZIP: {zip_path}") + zip_path.unlink() + raise + + def _verify_zip_integrity(self, zip_path: Path) -> bool: + """ + Verify ZIP file integrity. + + Args: + zip_path: Path to ZIP file + + Returns: + True if ZIP is valid, False otherwise + """ + try: + with zipfile.ZipFile(zip_path, 'r') as zf: + # testzip() returns None if ZIP is valid, or name of first corrupt file + corrupt_file = zf.testzip() + if corrupt_file: + logger.error(f"Corrupt file in ZIP: {corrupt_file}") + return False + return True + except zipfile.BadZipFile: + logger.error(f"Invalid ZIP file: {zip_path}") + return False + except Exception as e: + logger.error(f"Error verifying ZIP: {e}") + return False + + def verify_zip_structure(self, zip_path: Path, expected_files: Optional[List[str]] = None) -> ZIPValidationResult: + """ + Verify ZIP structure complies with HathiTrust requirements. 
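+
+        The archive is opened read-only; problems are collected into the
+        returned ZIPValidationResult rather than raised.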
+ + Checks: + - Flat structure (no subdirectories) + - All expected files present + - No unexpected files + - ZIP integrity + + Args: + zip_path: Path to ZIP file to validate + expected_files: Optional list of expected filenames + + Returns: + ZIPValidationResult with validation details + """ + zip_path = Path(zip_path) + errors = [] + warnings = [] + missing_files = [] + extra_files = [] + has_subdirectories = False + + # Check ZIP exists + if not zip_path.exists(): + errors.append(f"ZIP file not found: {zip_path}") + return ZIPValidationResult( + is_valid=False, + zip_path=zip_path, + file_count=0, + has_subdirectories=False, + missing_files=[], + extra_files=[], + errors=errors, + warnings=warnings + ) + + try: + with zipfile.ZipFile(zip_path, 'r') as zf: + # Get list of files in ZIP + zip_contents = zf.namelist() + + # Check for subdirectories (any path containing '/') + for name in zip_contents: + if '/' in name or '\\' in name: + has_subdirectories = True + errors.append(f"Subdirectory found in ZIP: {name}") + + # Check for macOS metadata + if '__MACOSX' in name or name.startswith('._'): + warnings.append(f"macOS metadata found: {name}") + + # Verify expected files if provided + if expected_files: + zip_set = set(zip_contents) + expected_set = set(expected_files) + + missing_files = sorted(expected_set - zip_set) + extra_files = sorted(zip_set - expected_set) + + if missing_files: + errors.append(f"Missing {len(missing_files)} expected files") + + if extra_files: + # Filter out macOS metadata from extras + non_meta_extras = [f for f in extra_files + if not (f.startswith('._') or '__MACOSX' in f)] + if non_meta_extras: + warnings.append(f"Found {len(non_meta_extras)} unexpected files") + + # Verify ZIP integrity + corrupt_file = zf.testzip() + if corrupt_file: + errors.append(f"ZIP corruption detected: {corrupt_file}") + + is_valid = len(errors) == 0 + + return ZIPValidationResult( + is_valid=is_valid, + zip_path=zip_path, + file_count=len(zip_contents), + has_subdirectories=has_subdirectories, + missing_files=missing_files, + extra_files=extra_files, + errors=errors, + warnings=warnings + ) + + except zipfile.BadZipFile: + errors.append("Invalid ZIP file format") + return ZIPValidationResult( + is_valid=False, + zip_path=zip_path, + file_count=0, + has_subdirectories=False, + missing_files=[], + extra_files=[], + errors=errors, + warnings=warnings + ) + except Exception as e: + errors.append(f"Error reading ZIP: {str(e)}") + return ZIPValidationResult( + is_valid=False, + zip_path=zip_path, + file_count=0, + has_subdirectories=False, + missing_files=[], + extra_files=[], + errors=errors, + warnings=warnings + ) + + def list_zip_contents(self, zip_path: Path) -> List[str]: + """ + List all files in a ZIP archive. + + Args: + zip_path: Path to ZIP file + + Returns: + List of filenames in ZIP (sorted) + + Raises: + FileNotFoundError: If ZIP doesn't exist + zipfile.BadZipFile: If ZIP is corrupt + """ + zip_path = Path(zip_path) + + if not zip_path.exists(): + raise FileNotFoundError(f"ZIP file not found: {zip_path}") + + try: + with zipfile.ZipFile(zip_path, 'r') as zf: + return sorted(zf.namelist()) + except zipfile.BadZipFile as e: + logger.error(f"Invalid ZIP file: {zip_path}") + raise + + def extract_zip(self, zip_path: Path, extract_to: Path) -> bool: + """ + Extract ZIP archive to specified directory. 
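+
+        The destination directory is created if necessary; failures are logged
+        and reported through the boolean return value rather than raised.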
+ + Args: + zip_path: Path to ZIP file + extract_to: Directory where files will be extracted + + Returns: + True if extraction successful, False otherwise + """ + zip_path = Path(zip_path) + extract_to = Path(extract_to) + + if not zip_path.exists(): + logger.error(f"ZIP file not found: {zip_path}") + return False + + try: + extract_to.mkdir(parents=True, exist_ok=True) + + logger.info(f"Extracting ZIP: {zip_path}") + logger.info(f"Extract to: {extract_to}") + + with zipfile.ZipFile(zip_path, 'r') as zf: + zf.extractall(extract_to) + + extracted_files = list(extract_to.iterdir()) + logger.info(f"āœ“ Extracted {len(extracted_files)} files") + return True + + except Exception as e: + logger.error(f"Failed to extract ZIP: {e}") + return False + + +def create_package_zip(package_dir: Path, volume_id: str, output_dir: Path) -> Optional[Path]: + """ + Convenience function to create ZIP archive from package directory. + + Args: + package_dir: Path to assembled package directory + volume_id: Volume identifier for ZIP naming + output_dir: Directory where ZIP will be created + + Returns: + Path to created ZIP, or None if failed + """ + packager = ZIPPackager(output_dir) + return packager.create_zip_archive(package_dir, volume_id) + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser( + description='Create HathiTrust-compliant ZIP archives from assembled packages' + ) + + parser.add_argument( + 'package_dir', + type=Path, + nargs='?', + help='Path to assembled package directory' + ) + parser.add_argument( + '--output-dir', + type=Path, + default=Path('output'), + help='Directory where ZIP file will be created (default: output/)' + ) + parser.add_argument( + '--verify', + type=Path, + metavar='ZIP_FILE', + help='Verify structure of existing ZIP file' + ) + parser.add_argument( + '--list', + type=Path, + metavar='ZIP_FILE', + help='List contents of ZIP file' + ) + parser.add_argument( + '--extract', + type=Path, + metavar='ZIP_FILE', + help='Extract ZIP file' + ) + parser.add_argument( + '--extract-to', + type=Path, + default=Path('extracted'), + help='Directory for extraction (default: extracted/)' + ) + + args = parser.parse_args() + + # Handle --verify flag + if args.verify: + packager = ZIPPackager(Path('.')) + result = packager.verify_zip_structure(args.verify) + + print(f"\n{'='*60}") + print(f"ZIP Validation Report: {args.verify.name}") + print(f"{'='*60}") + print(f"Valid: {'āœ“ YES' if result.is_valid else 'āœ— NO'}") + print(f"File Count: {result.file_count}") + print(f"Has Subdirectories: {'āœ— YES' if result.has_subdirectories else 'āœ“ NO'}") + + if result.errors: + print(f"\nāŒ Errors ({len(result.errors)}):") + for error in result.errors: + print(f" - {error}") + + if result.warnings: + print(f"\nāš ļø Warnings ({len(result.warnings)}):") + for warning in result.warnings: + print(f" - {warning}") + + if result.missing_files: + print(f"\nāŒ Missing Files ({len(result.missing_files)}):") + for file in result.missing_files[:10]: + print(f" - {file}") + if len(result.missing_files) > 10: + print(f" ... and {len(result.missing_files) - 10} more") + + if result.extra_files: + print(f"\nāš ļø Extra Files ({len(result.extra_files)}):") + for file in result.extra_files[:10]: + print(f" - {file}") + if len(result.extra_files) > 10: + print(f" ... 
and {len(result.extra_files) - 10} more") + + print(f"{'='*60}\n") + exit(0 if result.is_valid else 1) + + # Handle --list flag + if args.list: + packager = ZIPPackager(Path('.')) + try: + contents = packager.list_zip_contents(args.list) + print(f"\nContents of {args.list.name} ({len(contents)} files):") + print(f"{'='*60}") + for filename in contents: + print(f" {filename}") + print(f"{'='*60}\n") + except Exception as e: + logger.error(f"Failed to list ZIP contents: {e}") + exit(1) + exit(0) + + # Handle --extract flag + if args.extract: + packager = ZIPPackager(Path('.')) + success = packager.extract_zip(args.extract, args.extract_to) + exit(0 if success else 1) + + # Create ZIP from package directory + if not args.package_dir: + parser.error("package_dir is required when not using --verify, --list, or --extract") + + if not args.package_dir.exists(): + logger.error(f"Package directory not found: {args.package_dir}") + exit(1) + + # Extract volume ID from directory name + volume_id = args.package_dir.name + + try: + packager = ZIPPackager(args.output_dir) + zip_path = packager.create_zip_archive(args.package_dir, volume_id) + + if zip_path: + print(f"\nāœ“ Successfully created ZIP: {zip_path}") + print(f"āœ“ ZIP size: {zip_path.stat().st_size:,} bytes") + + # Run validation + result = packager.verify_zip_structure(zip_path) + if result.is_valid: + print(f"āœ“ ZIP structure validated ({result.file_count} files)") + else: + print(f"āš ļø ZIP validation warnings: {len(result.errors)} errors, {len(result.warnings)} warnings") + + exit(0) + else: + logger.error("ZIP creation failed") + exit(1) + + except Exception as e: + logger.error(f"Error: {e}") + exit(1) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..7dbd035 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,6 @@ +""" +HathiTrust Package Automation - Test Suite +========================================== + +Test modules for HathiTrust processing pipeline components. 
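+
+The suite mixes pytest fixtures with unittest-style test cases.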
+""" diff --git a/test_checksum_generator.py b/tests/test_checksum_generator.py similarity index 99% rename from test_checksum_generator.py rename to tests/test_checksum_generator.py index bdc16db..fba9aab 100644 --- a/test_checksum_generator.py +++ b/tests/test_checksum_generator.py @@ -7,7 +7,7 @@ import tempfile import shutil from pathlib import Path -from checksum_generator import ChecksumGenerator, generate_package_checksums +from src.checksum_generator import ChecksumGenerator, generate_package_checksums class TestChecksumGenerator: diff --git a/test_file_validator.py b/tests/test_file_validator.py similarity index 98% rename from test_file_validator.py rename to tests/test_file_validator.py index 8aff976..bbbecc1 100644 --- a/test_file_validator.py +++ b/tests/test_file_validator.py @@ -7,7 +7,7 @@ import tempfile import shutil from pathlib import Path -from file_validator import FileValidator, FileValidationResult +from src.file_validator import FileValidator, FileValidationResult class TestFileValidator(unittest.TestCase): diff --git a/tests/test_main_pipeline.py b/tests/test_main_pipeline.py new file mode 100644 index 0000000..dbf37d4 --- /dev/null +++ b/tests/test_main_pipeline.py @@ -0,0 +1,243 @@ +""" +Integration tests for main_pipeline.py + +These tests verify end-to-end pipeline functionality including: +- Single volume processing +- Batch processing +- Error recovery +- Resume mode +- Missing metadata handling +""" + +import pytest +from pathlib import Path +import json +import shutil +from PIL import Image +import zipfile + +from main_pipeline import ( + PipelineConfig, + VolumeResult, + ProcessingResults, + load_configuration, + check_metadata_file, + check_existing_package, + process_volume, + main_pipeline, + generate_reports +) +from volume_discovery import VolumeGroup + + +@pytest.fixture +def test_dirs(tmp_path): + """Create test directory structure.""" + dirs = { + 'input': tmp_path / 'input', + 'output': tmp_path / 'output', + 'temp': tmp_path / 'temp', + 'logs': tmp_path / 'logs' + } + + for dir_path in dirs.values(): + dir_path.mkdir(parents=True, exist_ok=True) + + return dirs + + +@pytest.fixture +def sample_config(test_dirs, tmp_path): + """Create sample pipeline configuration.""" + return PipelineConfig( + input_dir=test_dirs['input'], + output_dir=test_dirs['output'], + temp_dir=test_dirs['temp'], + logs_dir=test_dirs['logs'], + config_path=tmp_path / 'config.yaml', + ocr_language='eng', + resume_mode=False, + keep_temp=False, + verbose=False + ) + + +@pytest.fixture +def create_test_volume(test_dirs, tmp_path): + """Factory fixture to create test volumes with TIFF files and metadata.""" + def _create_volume(volume_id, num_pages=3): + volume_dir = test_dirs['input'] / volume_id + volume_dir.mkdir(exist_ok=True) + + # Create test TIFF files + tiff_files = [] + for i in range(1, num_pages + 1): + seq = str(i).zfill(8) + filename = f"{volume_id}_{seq}.tif" + filepath = volume_dir / filename + + # Create simple test image + img = Image.new('L', (100, 100), color=255) + img.save(filepath, 'TIFF') + tiff_files.append(filepath) + + # Create metadata JSON + metadata = { + 'volume_id': volume_id, + 'capture_date': '2025-10-01', + 'scanner_user': 'test_user', + 'scanner_make': 'Test', + 'scanner_model': 'Scanner', + 'image_compression_date': '2025-10-01', + 'image_compression_tool': 'Test Tool', + 'resolution_dpi': 600, + 'scanning_order': 'left-to-right', + 'reading_order': 'left-to-right' + } + + metadata_path = tmp_path / f'metadata_{volume_id}.json' + with 
open(metadata_path, 'w') as f: + json.dump(metadata, f) + + return { + 'volume_id': volume_id, + 'tiff_files': tiff_files, + 'metadata_path': metadata_path, + 'num_pages': num_pages + } + + return _create_volume + + +# Test: Check metadata file +def test_check_metadata_file(tmp_path): + """Test metadata file checking.""" + volume_id = '12345678' + + # Should raise FileNotFoundError when metadata doesn't exist + with pytest.raises(FileNotFoundError): + check_metadata_file(volume_id) + + # Create metadata file + metadata_path = tmp_path / f'metadata_{volume_id}.json' + metadata_path.write_text('{}') + + # Change to tmp directory + import os + old_cwd = os.getcwd() + os.chdir(tmp_path) + + try: + # Should return path when metadata exists + result = check_metadata_file(volume_id) + assert result == metadata_path + finally: + os.chdir(old_cwd) + + +# Test: Check existing package +def test_check_existing_package(test_dirs, create_test_volume): + """Test existing package detection.""" + volume_id = '12345678' + + # No package exists yet + result = check_existing_package(volume_id, test_dirs['output']) + assert result is None + + # Create empty ZIP (invalid) + zip_path = test_dirs['output'] / f'{volume_id}.zip' + with zipfile.ZipFile(zip_path, 'w'): + pass + + # Should return None for invalid package + result = check_existing_package(volume_id, test_dirs['output']) + assert result is None + + +# Test: Processing results +def test_processing_results(): + """Test ProcessingResults data class.""" + results = ProcessingResults() + + # Initially empty + assert results.total_volumes == 0 + assert results.success_rate == 0.0 + + # Add successful volume + results.successful_volumes.append( + VolumeResult(volume_id='123', status='SUCCESS', processing_time=10.0) + ) + assert results.total_volumes == 1 + assert results.success_rate == 100.0 + + # Add failed volume + results.failed_volumes.append( + VolumeResult( + volume_id='456', + status='FAILED', + failed_stage='ocr', + error_message='Test error', + processing_time=5.0 + ) + ) + assert results.total_volumes == 2 + assert results.success_rate == 50.0 + + +# Test: Report generation +def test_generate_reports(sample_config): + """Test report generation.""" + results = ProcessingResults() + + # Add test results + results.successful_volumes.append( + VolumeResult( + volume_id='123', + status='SUCCESS', + output_zip_path=Path('/test/123.zip'), + processing_time=10.5 + ) + ) + + results.failed_volumes.append( + VolumeResult( + volume_id='456', + status='FAILED', + failed_stage='ocr', + error_message='OCR failed', + processing_time=3.2 + ) + ) + + # Generate reports + csv_path = generate_reports(results, sample_config) + + # Verify CSV created + assert csv_path.exists() + assert csv_path.name.startswith('processing_report_') + assert csv_path.suffix == '.csv' + + # Verify JSON created + json_path = csv_path.with_suffix('.json') + assert json_path.exists() + + # Verify CSV content + with open(csv_path) as f: + content = f.read() + assert '123' in content + assert '456' in content + assert 'SUCCESS' in content + assert 'FAILED' in content + + # Verify JSON structure + with open(json_path) as f: + data = json.load(f) + assert 'summary' in data + assert 'volumes' in data + assert data['summary']['total_volumes'] == 2 + assert data['summary']['successful'] == 1 + assert data['summary']['failed'] == 1 + + +# Note: Full integration tests requiring Tesseract OCR would go here +# These require actual OCR capabilities and are best run in CI/CD environment diff --git 
a/test_ocr_processor.py b/tests/test_ocr_processor.py similarity index 98% rename from test_ocr_processor.py rename to tests/test_ocr_processor.py index 9703716..4a9d38f 100644 --- a/test_ocr_processor.py +++ b/tests/test_ocr_processor.py @@ -7,7 +7,7 @@ from pathlib import Path import tempfile import shutil -from ocr_processor import OCRProcessor +from src.ocr_processor import OCRProcessor class TestOCRProcessor(unittest.TestCase): diff --git a/tests/test_package_assembler.py b/tests/test_package_assembler.py new file mode 100644 index 0000000..59b3cf9 --- /dev/null +++ b/tests/test_package_assembler.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python3 +""" +Tests for package_assembler.py +""" + +import pytest +from pathlib import Path +import shutil +import tempfile + +from src.package_assembler import PackageAssembler, PackageValidationResult + + +class TestPackageAssembler: + """Test suite for PackageAssembler""" + + @pytest.fixture + def temp_dirs(self): + """Create temporary directories for testing""" + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create test directories + input_dir = temp_path / "input" + ocr_dir = temp_path / "ocr" + output_dir = temp_path / "output" + + input_dir.mkdir() + ocr_dir.mkdir() + output_dir.mkdir() + + yield { + 'input': input_dir, + 'ocr': ocr_dir, + 'output': output_dir, + 'temp': temp_path + } + + @pytest.fixture + def sample_files(self, temp_dirs): + """Create sample TIFF, TXT, and HTML files""" + input_dir = temp_dirs['input'] + ocr_dir = temp_dirs['ocr'] + + # Create 3 sample pages + tiff_files = [] + txt_files = [] + html_files = [] + + for i in range(1, 4): # Pages 1-3 + seq = f"{i:08d}" + + # Create TIFF + tiff_path = input_dir / f"{seq}.tif" + tiff_path.write_text(f"Mock TIFF data for page {seq}") + tiff_files.append(tiff_path) + + # Create TXT + txt_path = ocr_dir / f"{seq}.txt" + txt_path.write_text(f"Mock OCR text for page {seq}") + txt_files.append(txt_path) + + # Create HTML + html_path = ocr_dir / f"{seq}.html" + html_path.write_text(f"Mock hOCR for page {seq}") + html_files.append(html_path) + + # Create meta.yml + meta_yml = ocr_dir / "meta.yml" + meta_yml.write_text("capture_date: '2025-09-30'\nscanner_user: 'test'") + + return { + 'tiff': tiff_files, + 'txt': txt_files, + 'html': html_files, + 'meta_yml': meta_yml + } + + def test_create_package_directory(self, temp_dirs): + """Test package directory creation""" + assembler = PackageAssembler(temp_dirs['output']) + + assert temp_dirs['output'].exists() + assert assembler.output_base_dir == temp_dirs['output'] + + def test_copy_files_to_package(self, temp_dirs, sample_files): + """Test file copying to package directory""" + assembler = PackageAssembler(temp_dirs['output']) + package_dir = temp_dirs['output'] / "test_volume" + package_dir.mkdir() + + # Copy TIFF files + copied = assembler.copy_files_to_package( + sample_files['tiff'], + package_dir + ) + + assert len(copied) == 3 + for copied_file in copied: + assert copied_file.exists() + assert copied_file.parent == package_dir + + def test_assemble_complete_package(self, temp_dirs, sample_files): + """Test complete package assembly""" + assembler = PackageAssembler(temp_dirs['output']) + + package_dir = assembler.assemble_package( + volume_id="39015012345678", + tiff_files=sample_files['tiff'], + text_files=sample_files['txt'], + hocr_files=sample_files['html'], + meta_yml=sample_files['meta_yml'], + generate_checksum=False # Skip checksum for this test + ) + + # Check package was created + assert 
package_dir.exists() + assert package_dir.name == "39015012345678" + + # Check files were copied + assert (package_dir / "00000001.tif").exists() + assert (package_dir / "00000001.txt").exists() + assert (package_dir / "00000001.html").exists() + assert (package_dir / "meta.yml").exists() + + def test_validate_flat_structure(self, temp_dirs): + """Test validation detects subdirectories""" + assembler = PackageAssembler(temp_dirs['output']) + package_dir = temp_dirs['output'] / "test_package" + package_dir.mkdir() + + # Create a subdirectory (not allowed) + subdir = package_dir / "subdir" + subdir.mkdir() + + # Create valid files + (package_dir / "00000001.tif").write_text("test") + (package_dir / "00000001.txt").write_text("test") + (package_dir / "00000001.html").write_text("test") + (package_dir / "meta.yml").write_text("test") + + # Validate + result = assembler.validate_package_structure(package_dir) + + assert not result.is_valid + assert any("Subdirectories found" in error for error in result.errors) + + def test_validate_missing_meta_yml(self, temp_dirs): + """Test validation detects missing meta.yml""" + assembler = PackageAssembler(temp_dirs['output']) + package_dir = temp_dirs['output'] / "test_package" + package_dir.mkdir() + + # Create files but no meta.yml + (package_dir / "00000001.tif").write_text("test") + (package_dir / "00000001.txt").write_text("test") + (package_dir / "00000001.html").write_text("test") + + result = assembler.validate_package_structure(package_dir) + + assert not result.is_valid + assert any("meta.yml" in error for error in result.errors) + + def test_validate_triplet_completeness(self, temp_dirs): + """Test validation detects missing triplet files""" + assembler = PackageAssembler(temp_dirs['output']) + package_dir = temp_dirs['output'] / "test_package" + package_dir.mkdir() + + # Create TIFF and TXT but missing HTML + (package_dir / "00000001.tif").write_text("test") + (package_dir / "00000001.txt").write_text("test") + # Missing: 00000001.html + (package_dir / "meta.yml").write_text("test") + + result = assembler.validate_package_structure(package_dir) + + assert not result.is_valid + assert any("missing corresponding HTML" in error for error in result.errors) + + def test_validate_sequential_numbering(self, temp_dirs): + """Test validation detects gaps in sequence""" + assembler = PackageAssembler(temp_dirs['output']) + package_dir = temp_dirs['output'] / "test_package" + package_dir.mkdir() + + # Create files with gap (1, 2, 4 - missing 3) + for seq in ["00000001", "00000002", "00000004"]: + (package_dir / f"{seq}.tif").write_text("test") + (package_dir / f"{seq}.txt").write_text("test") + (package_dir / f"{seq}.html").write_text("test") + (package_dir / "meta.yml").write_text("test") + + result = assembler.validate_package_structure(package_dir) + + assert not result.is_valid + assert any("Non-sequential" in error for error in result.errors) + + def test_with_checksum_generation(self, temp_dirs, sample_files): + """Test package assembly with automatic checksum generation""" + assembler = PackageAssembler(temp_dirs['output']) + + package_dir = assembler.assemble_package( + volume_id="39015012345678", + tiff_files=sample_files['tiff'], + text_files=sample_files['txt'], + hocr_files=sample_files['html'], + meta_yml=sample_files['meta_yml'], + generate_checksum=True + ) + + # Check checksum.md5 was generated + checksum_file = package_dir / "checksum.md5" + assert checksum_file.exists() + + # Check checksum file contains entries + content = 
checksum_file.read_text() + assert "00000001.tif" in content + assert "meta.yml" in content + + def test_validate_valid_package(self, temp_dirs): + """Test validation passes for valid package""" + assembler = PackageAssembler(temp_dirs['output']) + package_dir = temp_dirs['output'] / "test_package" + package_dir.mkdir() + + # Create complete, valid package + for i in range(1, 4): + seq = f"{i:08d}" + (package_dir / f"{seq}.tif").write_text("test") + (package_dir / f"{seq}.txt").write_text("test") + (package_dir / f"{seq}.html").write_text("test") + (package_dir / "meta.yml").write_text("test") + + result = assembler.validate_package_structure(package_dir) + + assert result.is_valid + assert len(result.errors) == 0 + assert result.total_files == 10 # 3 tiff + 3 txt + 3 html + 1 meta.yml + + def test_missing_metadata_error(self, temp_dirs, sample_files): + """Test error when meta.yml is missing""" + assembler = PackageAssembler(temp_dirs['output']) + + # Point to non-existent meta.yml + fake_meta = temp_dirs['temp'] / "nonexistent.yml" + + with pytest.raises(ValueError, match="meta.yml not found"): + assembler.assemble_package( + volume_id="39015012345678", + tiff_files=sample_files['tiff'], + text_files=sample_files['txt'], + hocr_files=sample_files['html'], + meta_yml=fake_meta, + generate_checksum=False + ) + + def test_nonexistent_package_validation(self, temp_dirs): + """Test validation of non-existent package""" + assembler = PackageAssembler(temp_dirs['output']) + fake_package = temp_dirs['output'] / "nonexistent" + + result = assembler.validate_package_structure(fake_package) + + assert not result.is_valid + assert any("does not exist" in error for error in result.errors) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_package_validator.py b/tests/test_package_validator.py new file mode 100644 index 0000000..bb07491 --- /dev/null +++ b/tests/test_package_validator.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python3 +""" +Tests for Step 9: Quality Control & Validation +""" + +import pytest +import zipfile +import yaml +from pathlib import Path +from src.package_validator import PackageValidator, ValidationReport, validate_hathitrust_package + + +@pytest.fixture +def temp_dirs(tmp_path): + """Create temporary directories for testing""" + package_dir = tmp_path / "package" + output_dir = tmp_path / "output" + + package_dir.mkdir() + output_dir.mkdir() + + return { + 'package': package_dir, + 'output': output_dir, + 'tmp': tmp_path + } + + +@pytest.fixture +def valid_package_zip(temp_dirs): + """Create a valid HathiTrust package ZIP for testing""" + package_dir = temp_dirs['package'] + + # Create triplets (5 pages) + for i in range(1, 6): + seq = f"{i:08d}" + (package_dir / f"{seq}.tif").write_text(f"TIFF content {i}") + (package_dir / f"{seq}.txt").write_text(f"Text OCR {i}") + (package_dir / f"{seq}.html").write_text(f"hOCR {i}") + + # Create valid meta.yml + metadata = { + 'capture_date': '2025-09-30', + 'scanner_user': 'testuser', + 'scanner_make': 'TestScanner', + 'scanner_model': 'Model1', + 'scanning_order': 'left-to-right', + 'reading_order': 'left-to-right', + 'pagedata': { + '00000001': {'orderlabel': '00000001', 'label': 'FRONT_COVER'}, + '00000002': {'orderlabel': '00000002', 'label': '00000002'}, + '00000003': {'orderlabel': '00000003', 'label': '00000003'}, + '00000004': {'orderlabel': '00000004', 'label': '00000004'}, + '00000005': {'orderlabel': '00000005', 'label': 'BACK_COVER'} + } + } + yaml_content = yaml.dump(metadata, 
default_flow_style=False) + (package_dir / "meta.yml").write_text(yaml_content) + + # Create checksum.md5 + from checksum_generator import ChecksumGenerator + generator = ChecksumGenerator() + + checksum_entries = [] + for file in sorted(package_dir.iterdir()): + if file.name != 'checksum.md5': + md5_hash = generator.compute_md5(str(file)) + checksum_entries.append(f"{md5_hash} {file.name}") + + (package_dir / "checksum.md5").write_text('\n'.join(checksum_entries)) + + # Create ZIP + zip_path = temp_dirs['output'] / "39015012345678.zip" + with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf: + for file in package_dir.iterdir(): + zf.write(file, arcname=file.name) + + return zip_path + + +def test_validate_valid_package(valid_package_zip): + """Test validation of a completely valid package""" + validator = PackageValidator() + report = validator.validate_package(valid_package_zip) + + assert report.is_valid + assert report.failed_checks == 0 + assert report.passed_checks > 0 + assert report.has_required_files + assert report.has_valid_triplets + assert report.has_valid_yaml + assert report.has_valid_checksums + + +def test_naming_convention_barcode(temp_dirs): + """Test validation of barcode-style identifier""" + zip_path = temp_dirs['output'] / "39015012345678.zip" + + # Create minimal valid ZIP + package_dir = temp_dirs['package'] + (package_dir / "00000001.tif").write_text("test") + (package_dir / "00000001.txt").write_text("test") + (package_dir / "00000001.html").write_text("test") + (package_dir / "meta.yml").write_text("capture_date: 2025-09-30\nscanner_user: test\npagedata: {}") + (package_dir / "checksum.md5").write_text("abc123 00000001.tif") + + with zipfile.ZipFile(zip_path, 'w') as zf: + for file in package_dir.iterdir(): + zf.write(file, arcname=file.name) + + validator = PackageValidator() + report = validator.validate_package(zip_path) + + assert report.volume_id == "39015012345678" + assert any("Valid barcode identifier" in check for check in report.naming_checks) + + +def test_naming_convention_ark(temp_dirs): + """Test validation of ARK-style identifier""" + zip_path = temp_dirs['output'] / "ark_12345_abc.zip" + + # Create minimal valid ZIP + package_dir = temp_dirs['package'] + (package_dir / "00000001.tif").write_text("test") + (package_dir / "00000001.txt").write_text("test") + (package_dir / "00000001.html").write_text("test") + (package_dir / "meta.yml").write_text("capture_date: 2025-09-30\nscanner_user: test\npagedata: {}") + (package_dir / "checksum.md5").write_text("abc123 00000001.tif") + + with zipfile.ZipFile(zip_path, 'w') as zf: + for file in package_dir.iterdir(): + zf.write(file, arcname=file.name) + + validator = PackageValidator() + report = validator.validate_package(zip_path) + + assert report.volume_id == "ark_12345_abc" + assert any("Valid ARK identifier" in check for check in report.naming_checks) + + + +def test_detect_subdirectories(temp_dirs): + """Test detection of subdirectories in ZIP""" + zip_path = temp_dirs['output'] / "test.zip" + + with zipfile.ZipFile(zip_path, 'w') as zf: + zf.writestr("subdir/00000001.tif", "test") + zf.writestr("meta.yml", "capture_date: 2025-09-30") + zf.writestr("checksum.md5", "abc123 00000001.tif") + + validator = PackageValidator() + report = validator.validate_package(zip_path) + + assert not report.is_valid + assert any("subdirectories" in error.lower() for error in report.errors) + + +def test_missing_required_files(temp_dirs): + """Test detection of missing required files""" + zip_path = 
temp_dirs['output'] / "39015012345678.zip" + + # Create ZIP without meta.yml + with zipfile.ZipFile(zip_path, 'w') as zf: + zf.writestr("00000001.tif", "test") + zf.writestr("00000001.txt", "test") + zf.writestr("00000001.html", "test") + # Missing: meta.yml + zf.writestr("checksum.md5", "abc123 00000001.tif") + + validator = PackageValidator() + report = validator.validate_package(zip_path) + + assert not report.is_valid + assert not report.has_required_files + assert any("meta.yml" in error for error in report.errors) + + +def test_missing_triplet_companions(temp_dirs): + """Test detection of incomplete file triplets""" + zip_path = temp_dirs['output'] / "39015012345678.zip" + + with zipfile.ZipFile(zip_path, 'w') as zf: + # Missing .txt for 00000001 + zf.writestr("00000001.tif", "test") + zf.writestr("00000001.html", "test") + + # Missing .html for 00000002 + zf.writestr("00000002.tif", "test") + zf.writestr("00000002.txt", "test") + + zf.writestr("meta.yml", "capture_date: 2025-09-30\nscanner_user: test\npagedata: {}") + zf.writestr("checksum.md5", "abc123 00000001.tif") + + validator = PackageValidator() + report = validator.validate_package(zip_path) + + assert not report.is_valid + assert not report.has_valid_triplets + assert any("without matching TXT" in error for error in report.errors) + assert any("without matching HTML" in error for error in report.errors) + + +def test_sequence_gaps(temp_dirs): + """Test detection of gaps in sequential numbering""" + zip_path = temp_dirs['output'] / "39015012345678.zip" + + with zipfile.ZipFile(zip_path, 'w') as zf: + # Sequence: 1, 2, 4 (missing 3) + for seq in [1, 2, 4]: + seq_str = f"{seq:08d}" + zf.writestr(f"{seq_str}.tif", "test") + zf.writestr(f"{seq_str}.txt", "test") + zf.writestr(f"{seq_str}.html", "test") + + zf.writestr("meta.yml", "capture_date: 2025-09-30\nscanner_user: test\npagedata: {}") + zf.writestr("checksum.md5", "abc123 00000001.tif") + + validator = PackageValidator() + report = validator.validate_package(zip_path) + + assert not report.is_valid + assert any("gaps in sequential numbering" in error.lower() for error in report.errors) + + +def test_invalid_yaml_structure(temp_dirs): + """Test detection of invalid YAML metadata""" + zip_path = temp_dirs['output'] / "39015012345678.zip" + + with zipfile.ZipFile(zip_path, 'w') as zf: + zf.writestr("00000001.tif", "test") + zf.writestr("00000001.txt", "test") + zf.writestr("00000001.html", "test") + + # Missing required fields + zf.writestr("meta.yml", "some_field: value\nother_field: value") + + zf.writestr("checksum.md5", "abc123 00000001.tif") + + validator = PackageValidator() + report = validator.validate_package(zip_path) + + assert not report.is_valid + assert not report.has_valid_yaml + assert any("Missing required YAML field" in error for error in report.errors) + + + +def test_checksum_mismatch(temp_dirs): + """Test detection of checksum mismatches""" + zip_path = temp_dirs['output'] / "39015012345678.zip" + + with zipfile.ZipFile(zip_path, 'w') as zf: + zf.writestr("00000001.tif", "test content") + zf.writestr("00000001.txt", "test") + zf.writestr("00000001.html", "test") + + zf.writestr("meta.yml", "capture_date: 2025-09-30\nscanner_user: test\npagedata: {'00000001': {}}") + + # Wrong checksum for 00000001.tif + zf.writestr("checksum.md5", "wronghash123 00000001.tif\n") + + validator = PackageValidator() + report = validator.validate_package(zip_path) + + assert not report.is_valid + assert not report.has_valid_checksums + assert any("Checksum mismatch" in error 
for error in report.errors) + + +def test_empty_checksum_file(temp_dirs): + """Test detection of empty checksum file""" + zip_path = temp_dirs['output'] / "39015012345678.zip" + + with zipfile.ZipFile(zip_path, 'w') as zf: + zf.writestr("00000001.tif", "test") + zf.writestr("00000001.txt", "test") + zf.writestr("00000001.html", "test") + zf.writestr("meta.yml", "capture_date: 2025-09-30\nscanner_user: test\npagedata: {}") + zf.writestr("checksum.md5", "") # Empty + + validator = PackageValidator() + report = validator.validate_package(zip_path) + + assert not report.is_valid + assert any("checksum.md5 file is empty" in error for error in report.errors) + + +def test_nonexistent_zip(temp_dirs): + """Test validation of non-existent ZIP file""" + zip_path = temp_dirs['output'] / "nonexistent.zip" + + validator = PackageValidator() + report = validator.validate_package(zip_path) + + assert not report.is_valid + assert any("not found" in error for error in report.errors) + + +def test_corrupt_zip(temp_dirs): + """Test validation of corrupt ZIP file""" + zip_path = temp_dirs['output'] / "corrupt.zip" + + # Create corrupt ZIP + zip_path.write_text("This is not a valid ZIP file") + + validator = PackageValidator() + report = validator.validate_package(zip_path) + + assert not report.is_valid + assert any("Invalid ZIP" in error or "BadZipFile" in str(error) for error in report.errors) + + +def test_validation_report_summary(valid_package_zip): + """Test validation report summary generation""" + validator = PackageValidator() + report = validator.validate_package(valid_package_zip) + + summary = report.get_summary() + + assert "HathiTrust Package Validation Report" in summary + assert "39015012345678" in summary + assert "VALID" in summary + assert str(report.total_checks) in summary + assert str(report.passed_checks) in summary + + +def test_convenience_function(valid_package_zip): + """Test convenience function for validation""" + report = validate_hathitrust_package(valid_package_zip) + + assert isinstance(report, ValidationReport) + assert report.is_valid + assert report.package_path == valid_package_zip + + +def test_large_package_validation(temp_dirs): + """Test validation of larger package (100 pages)""" + package_dir = temp_dirs['package'] + + # Create 100-page package + for i in range(1, 101): + seq = f"{i:08d}" + (package_dir / f"{seq}.tif").write_text(f"TIFF {i}") + (package_dir / f"{seq}.txt").write_text(f"Text {i}") + (package_dir / f"{seq}.html").write_text(f"hOCR {i}") + + # Create metadata + metadata = { + 'capture_date': '2025-09-30', + 'scanner_user': 'test', + 'pagedata': {f"{i:08d}": {} for i in range(1, 101)} + } + (package_dir / "meta.yml").write_text(yaml.dump(metadata)) + + # Create checksums + from checksum_generator import ChecksumGenerator + generator = ChecksumGenerator() + checksum_entries = [] + for file in sorted(package_dir.iterdir()): + if file.name != 'checksum.md5': + md5_hash = generator.compute_md5(str(file)) + checksum_entries.append(f"{md5_hash} {file.name}") + (package_dir / "checksum.md5").write_text('\n'.join(checksum_entries)) + + # Create ZIP + zip_path = temp_dirs['output'] / "39015099999999.zip" + with zipfile.ZipFile(zip_path, 'w') as zf: + for file in package_dir.iterdir(): + zf.write(file, arcname=file.name) + + validator = PackageValidator() + report = validator.validate_package(zip_path) + + assert report.is_valid + assert report.tiff_count == 100 + assert report.file_count == 302 # 100*3 + meta.yml + checksum.md5 diff --git 
a/test_volume_discovery.py b/tests/test_volume_discovery.py similarity index 99% rename from test_volume_discovery.py rename to tests/test_volume_discovery.py index 71cecb7..3fafb53 100644 --- a/test_volume_discovery.py +++ b/tests/test_volume_discovery.py @@ -5,7 +5,7 @@ import unittest from pathlib import Path -from volume_discovery import ( +from src.volume_discovery import ( extract_sequence_number, extract_barcode_or_ark, VolumeGroup diff --git a/tests/test_yaml_generator.py b/tests/test_yaml_generator.py new file mode 100644 index 0000000..42fff1a --- /dev/null +++ b/tests/test_yaml_generator.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +""" +Unit tests for yaml_generator module +""" + +import unittest +import tempfile +import shutil +import json +import yaml +from pathlib import Path +from src.yaml_generator import YAMLGenerator + + +class TestYAMLGenerator(unittest.TestCase): + + @classmethod + def setUpClass(cls): + """Set up test fixtures""" + cls.temp_dir = Path(tempfile.mkdtemp()) + + # Create sample metadata JSON + cls.test_metadata = { + 'volume_identifier': { + 'type': 'barcode', + 'value': '39015012345678' + }, + 'capture_metadata': { + 'capture_date': '2025-01-15', + 'operator': 'Test User', + 'software': 'CaptureOne Cultural Heritage Edition', + 'software_version': '23.1.0' + }, + 'image_technical': { + 'resolution_dpi': 400, + 'color_mode': 'grayscale', + 'bitdepth': 8, + 'compression': 'None', + 'file_format': 'TIFF' + }, + 'page_order': { + 'scanning_order': 'left-to-right', + 'reading_order': 'left-to-right' + }, + 'content_description': { + 'material_type': 'book', + 'language': 'eng', + 'notes': 'Test book' + } + } + + # Save test metadata to JSON file + cls.metadata_json = cls.temp_dir / 'metadata_test.json' + with open(cls.metadata_json, 'w', encoding='utf-8') as f: + json.dump(cls.test_metadata, f, indent=2) + + @classmethod + def tearDownClass(cls): + """Clean up test directory""" + if cls.temp_dir.exists(): + shutil.rmtree(cls.temp_dir) + + def test_load_metadata_from_json(self): + """Test loading metadata from JSON file""" + generator = YAMLGenerator() + metadata = generator.load_metadata_from_json(self.metadata_json) + + self.assertIn('capture_metadata', metadata) + self.assertIn('page_order', metadata) + self.assertEqual(metadata['capture_metadata']['operator'], 'Test User') + + def test_generate_pagedata(self): + """Test pagedata generation""" + generator = YAMLGenerator() + pagedata = generator.generate_pagedata(5, 'left-to-right') + + # Check structure + self.assertEqual(len(pagedata), 5) + self.assertIn('00000001', pagedata) + self.assertIn('00000005', pagedata) + + # Check first page is marked as front cover + self.assertEqual(pagedata['00000001']['label'], 'FRONT_COVER') + + # Check last page is marked as back cover + self.assertEqual(pagedata['00000005']['label'], 'BACK_COVER') + + # Check middle pages have sequence numbers + self.assertEqual(pagedata['00000003']['orderlabel'], '00000003') + + def test_generate_meta_yml(self): + """Test complete meta.yml generation""" + generator = YAMLGenerator() + output_path = self.temp_dir / 'test_meta.yml' + + result = generator.generate_meta_yml(self.test_metadata, 10, output_path) + + # Check file was created + self.assertTrue(result.exists()) + + # Load and validate YAML + with open(result, 'r', encoding='utf-8') as f: + meta = yaml.safe_load(f) + + # Check required fields + self.assertIn('capture_date', meta) + self.assertIn('scanner_user', meta) + self.assertIn('pagedata', meta) + + # Check values + 
self.assertEqual(meta['capture_date'], '2025-01-15') + self.assertEqual(meta['scanner_user'], 'Test User') + self.assertEqual(meta['scanning_order'], 'left-to-right') + self.assertEqual(meta['reading_order'], 'left-to-right') + + # Check pagedata + self.assertEqual(len(meta['pagedata']), 10) + + def test_validate_yaml(self): + """Test YAML validation""" + generator = YAMLGenerator() + + # Create valid YAML + valid_yaml = self.temp_dir / 'valid.yml' + with open(valid_yaml, 'w') as f: + yaml.dump({ + 'capture_date': '2025-01-15', + 'scanner_user': 'Test', + 'pagedata': {'00000001': {'orderlabel': '00000001'}} + }, f) + + # Should pass validation + self.assertTrue(generator.validate_yaml(valid_yaml)) + + # Create invalid YAML (missing required field) + invalid_yaml = self.temp_dir / 'invalid.yml' + with open(invalid_yaml, 'w') as f: + yaml.dump({'capture_date': '2025-01-15'}, f) # Missing scanner_user and pagedata + + # Should fail validation + with self.assertRaises(ValueError): + generator.validate_yaml(invalid_yaml) + + def test_generate_from_volume(self): + """Test complete volume metadata generation""" + generator = YAMLGenerator() + + # Create fake TIFF files + tiff_dir = self.temp_dir / 'tiffs' + tiff_dir.mkdir() + tiff_files = [] + for i in range(1, 4): + tiff = tiff_dir / f'{i:08d}.tif' + tiff.touch() + tiff_files.append(tiff) + + # Generate meta.yml + output_dir = self.temp_dir / 'output' + output_dir.mkdir() + + result = generator.generate_from_volume( + '39015012345678', + self.metadata_json, + tiff_files, + output_dir + ) + + # Verify + self.assertTrue(result.exists()) + self.assertEqual(result.name, 'meta.yml') + + # Load and check + with open(result, 'r') as f: + meta = yaml.safe_load(f) + + self.assertEqual(len(meta['pagedata']), 3) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_zip_packager.py b/tests/test_zip_packager.py new file mode 100644 index 0000000..0268a79 --- /dev/null +++ b/tests/test_zip_packager.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +""" +Tests for Step 8: ZIP Archive Creation +""" + +import pytest +import zipfile +from pathlib import Path +from src.zip_packager import ZIPPackager, ZIPValidationResult, create_package_zip + + +@pytest.fixture +def temp_dirs(tmp_path): + """Create temporary directories for testing""" + package_dir = tmp_path / "package" + output_dir = tmp_path / "output" + extract_dir = tmp_path / "extracted" + + package_dir.mkdir() + output_dir.mkdir() + + return { + 'package': package_dir, + 'output': output_dir, + 'extract': extract_dir, + 'tmp': tmp_path + } + + +@pytest.fixture +def sample_package(temp_dirs): + """Create a sample package with triplets""" + package_dir = temp_dirs['package'] + + # Create sample files (triplets) + for i in range(1, 6): + seq = f"{i:08d}" + (package_dir / f"{seq}.tif").write_text(f"TIFF content {i}") + (package_dir / f"{seq}.txt").write_text(f"Text OCR {i}") + (package_dir / f"{seq}.html").write_text(f"hOCR {i}") + + # Create meta.yml and checksum.md5 + (package_dir / "meta.yml").write_text("capture_date: 2025-09-30") + (package_dir / "checksum.md5").write_text("abc123 00000001.tif") + + return package_dir + + +def test_create_zip_basic(temp_dirs, sample_package): + """Test basic ZIP creation from package""" + packager = ZIPPackager(temp_dirs['output']) + + zip_path = packager.create_zip_archive(sample_package, "39015012345678") + + assert zip_path is not None + assert zip_path.exists() + assert zip_path.name == "39015012345678.zip" + assert zip_path.stat().st_size > 0 + + +def 
test_zip_naming_convention(temp_dirs, sample_package): + """Test ZIP filename matches volume identifier""" + packager = ZIPPackager(temp_dirs['output']) + + # Test barcode identifier + zip_path = packager.create_zip_archive(sample_package, "39015012345678") + assert zip_path.name == "39015012345678.zip" + + # Test ARK identifier + zip_path2 = packager.create_zip_archive(sample_package, "ark_12345_abc123") + assert zip_path2.name == "ark_12345_abc123.zip" + + +def test_flat_structure(temp_dirs, sample_package): + """Test ZIP contains flat structure (no subdirectories)""" + packager = ZIPPackager(temp_dirs['output']) + zip_path = packager.create_zip_archive(sample_package, "test_volume") + + # Verify no paths contain directory separators + with zipfile.ZipFile(zip_path, 'r') as zf: + for name in zf.namelist(): + assert '/' not in name, f"Found path with directory: {name}" + assert '\\' not in name, f"Found path with directory: {name}" + + +def test_file_count_match(temp_dirs, sample_package): + """Test all package files are included in ZIP""" + packager = ZIPPackager(temp_dirs['output']) + + # Count source files + source_files = [f for f in sample_package.iterdir() if f.is_file()] + source_count = len(source_files) + + zip_path = packager.create_zip_archive(sample_package, "test_volume") + + # Count files in ZIP + with zipfile.ZipFile(zip_path, 'r') as zf: + zip_count = len(zf.namelist()) + + assert zip_count == source_count + + +def test_zip_integrity(temp_dirs, sample_package): + """Test ZIP file integrity""" + packager = ZIPPackager(temp_dirs['output']) + zip_path = packager.create_zip_archive(sample_package, "test_volume") + + # ZIP should pass integrity check + with zipfile.ZipFile(zip_path, 'r') as zf: + corrupt_file = zf.testzip() + assert corrupt_file is None, f"ZIP integrity check failed: {corrupt_file}" + + +def test_verify_valid_zip(temp_dirs, sample_package): + """Test validation of valid ZIP structure""" + packager = ZIPPackager(temp_dirs['output']) + zip_path = packager.create_zip_archive(sample_package, "test_volume") + + result = packager.verify_zip_structure(zip_path) + + assert result.is_valid + assert result.file_count > 0 + assert not result.has_subdirectories + assert len(result.errors) == 0 + + +def test_detect_subdirectories(temp_dirs): + """Test validation detects subdirectories in ZIP""" + # Create ZIP with subdirectory structure + bad_zip = temp_dirs['output'] / "bad_structure.zip" + + with zipfile.ZipFile(bad_zip, 'w') as zf: + zf.writestr("subdir/file.txt", "content") + zf.writestr("file.txt", "content") + + packager = ZIPPackager(temp_dirs['output']) + result = packager.verify_zip_structure(bad_zip) + + assert not result.is_valid + assert result.has_subdirectories + assert len(result.errors) > 0 + + +def test_macosx_filtering(temp_dirs): + """Test macOS metadata files are handled""" + package_dir = temp_dirs['package'] + + # Create files including macOS metadata + (package_dir / "00000001.tif").write_text("content") + (package_dir / "._00000001.tif").write_text("macOS metadata") + (package_dir / ".DS_Store").write_text("macOS DS_Store") + + packager = ZIPPackager(temp_dirs['output']) + zip_path = packager.create_zip_archive(package_dir, "test_volume") + + # macOS files should be skipped + with zipfile.ZipFile(zip_path, 'r') as zf: + names = zf.namelist() + assert "00000001.tif" in names + assert "._00000001.tif" not in names + assert ".DS_Store" not in names + + + +def test_list_contents(temp_dirs, sample_package): + """Test listing ZIP contents""" + packager = 
ZIPPackager(temp_dirs['output']) + zip_path = packager.create_zip_archive(sample_package, "test_volume") + + contents = packager.list_zip_contents(zip_path) + + assert isinstance(contents, list) + assert len(contents) > 0 + assert "00000001.tif" in contents + assert "meta.yml" in contents + # Contents should be sorted + assert contents == sorted(contents) + + +def test_extract_functionality(temp_dirs, sample_package): + """Test ZIP extraction""" + packager = ZIPPackager(temp_dirs['output']) + zip_path = packager.create_zip_archive(sample_package, "test_volume") + + success = packager.extract_zip(zip_path, temp_dirs['extract']) + + assert success + assert temp_dirs['extract'].exists() + + # Check extracted files + extracted_files = list(temp_dirs['extract'].iterdir()) + assert len(extracted_files) > 0 + assert (temp_dirs['extract'] / "00000001.tif").exists() + assert (temp_dirs['extract'] / "meta.yml").exists() + + +def test_large_package(temp_dirs): + """Test handling package with many files""" + package_dir = temp_dirs['package'] + + # Create 100 triplets + for i in range(1, 101): + seq = f"{i:08d}" + (package_dir / f"{seq}.tif").write_text(f"TIFF {i}") + (package_dir / f"{seq}.txt").write_text(f"Text {i}") + (package_dir / f"{seq}.html").write_text(f"hOCR {i}") + + (package_dir / "meta.yml").write_text("metadata") + (package_dir / "checksum.md5").write_text("checksums") + + packager = ZIPPackager(temp_dirs['output']) + zip_path = packager.create_zip_archive(package_dir, "large_volume") + + assert zip_path is not None + + with zipfile.ZipFile(zip_path, 'r') as zf: + # 100 triplets + 2 metadata files = 302 files + assert len(zf.namelist()) == 302 + + +def test_empty_package_error(temp_dirs): + """Test error handling for empty package directory""" + empty_dir = temp_dirs['package'] + packager = ZIPPackager(temp_dirs['output']) + + with pytest.raises(ValueError, match="empty"): + packager.create_zip_archive(empty_dir, "test_volume") + + +def test_missing_package_error(temp_dirs): + """Test error handling for non-existent package""" + missing_dir = temp_dirs['tmp'] / "nonexistent" + packager = ZIPPackager(temp_dirs['output']) + + with pytest.raises(FileNotFoundError): + packager.create_zip_archive(missing_dir, "test_volume") + + +def test_verify_missing_zip(temp_dirs): + """Test validation of non-existent ZIP""" + packager = ZIPPackager(temp_dirs['output']) + missing_zip = temp_dirs['output'] / "missing.zip" + + result = packager.verify_zip_structure(missing_zip) + + assert not result.is_valid + assert len(result.errors) > 0 + assert "not found" in result.errors[0].lower() + + +def test_convenience_function(temp_dirs, sample_package): + """Test convenience function for ZIP creation""" + zip_path = create_package_zip( + sample_package, + "39015012345678", + temp_dirs['output'] + ) + + assert zip_path is not None + assert zip_path.exists() + assert zip_path.name == "39015012345678.zip"
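
Taken together, the suites above exercise the two public entry points that downstream scripts are expected to call: `create_package_zip()` and `validate_hathitrust_package()`. A minimal sketch of chaining them, assuming an already-assembled flat package directory (the paths below are placeholders, not part of the patch):

```python
from pathlib import Path

from src.zip_packager import create_package_zip
from src.package_validator import validate_hathitrust_package

# Placeholder paths: an assembled flat package and the output directory
package_dir = Path("output/39015012345678")
zip_path = create_package_zip(package_dir, "39015012345678", Path("output"))

if zip_path is not None:
    # Run the same compliance checks the validator tests cover
    report = validate_hathitrust_package(zip_path)
    print(report.get_summary())
    if not report.is_valid:
        for error in report.errors:
            print(f"  - {error}")
```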
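The relocated test modules each keep a `__main__` hook (`pytest.main([__file__, "-v"])` or `unittest.main()`), and the whole `tests/` directory can be driven the same way from the repository root. A hypothetical one-file runner in that style (not part of the patch):

```python
import sys

import pytest

if __name__ == "__main__":
    # Discover and run everything under tests/, mirroring the per-module hooks
    sys.exit(pytest.main(["tests", "-v"]))
```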