From 9f0cf7630436c86df9be779a8563fbe3789651a3 Mon Sep 17 00:00:00 2001
From: schipp0 <schipp0@purdue.edu>
Date: Tue, 30 Sep 2025 17:37:53 +0000
Subject: [PATCH] Add Step 4: File Validation & Naming Convention

Implements HathiTrust's 8-digit sequential naming standard and file
validation to ensure compliance before package assembly.

New components:
- file_validator.py: Core validation and standardization module
  * FileValidator class with dry-run support
  * format_sequence_number(): Converts to 8-digit zero-padded format
  * validate_single_file(): Validates and renames individual files
  * validate_file_list(): Batch validation with statistics
  * verify_sequential_naming(): Detects gaps in sequences
  * verify_matching_triplets(): Ensures TIFF/TXT/HTML sets match

- test_file_validator.py: Comprehensive test suite (8 tests)
  * Tests formatting, extraction, validation, gap detection
  * Tests triplet matching for complete file sets
  * All tests passing

- DEMO_step4.md: Usage examples and documentation

Features:
- Enforces 8-digit zero-padded sequential naming (00000001.tif)
- Detects and reports gaps in file sequences
- Automatic file renaming to HathiTrust standard
- Dry-run mode for safe preview before changes
- Verify-only mode for validation without modifications
- Case-insensitive extension handling
- Detailed error reporting with FileValidationResult dataclass

CLI usage:
  python3 file_validator.py <directory> [--extension tif] [--dry-run] [--verify-only]

Updated README.md with Step 4 documentation.

Progress: Steps 1-4 complete (40% of pipeline)
---
 DEMO_step4.md          |  96 +++++++++++
 README.md              |  25 ++-
 file_validator.py      | 365 +++++++++++++++++++++++++++++++++++++++++
 test_file_validator.py | 161 ++++++++++++++++++
 4 files changed, 646 insertions(+), 1 deletion(-)
 create mode 100644 DEMO_step4.md
 create mode 100755 file_validator.py
 create mode 100644 test_file_validator.py
diff --git a/DEMO_step4.md b/DEMO_step4.md
new file mode 100644
index 0000000..7376d01
--- /dev/null
+++ b/DEMO_step4.md
@@ -0,0 +1,96 @@
+## Step 4: File Validation & Naming Convention - DEMO
+
+### Purpose
+Ensures all files follow HathiTrust's strict 8-digit sequential naming convention:
+- Format: `00000001.tif`, `00000001.txt`, `00000001.html`
+- Sequential: No gaps allowed (1, 2, 3... not 1, 2, 4)
+- Zero-padded: Always 8 digits
+
+### Test the Validator
+
+#### 1. Verify properly named files:
+```bash
+cd /path/to/HathiTrust  # repository root; adjust to your checkout
+
+# Check if files are properly named (no changes)
+python3 file_validator.py temp/39015012345678 --verify-only
+```
+
+Expected output:
+```
+✓ All files are properly named and sequential
+```
+
+#### 2. Validate and standardize files (dry run):
+```bash
+# See what would be renamed without actually renaming
+python3 file_validator.py input/ --extension tif --dry-run
+```
+
+#### 3. Actually rename files to standard format:
+```bash
+# Rename files to match HathiTrust convention
+python3 file_validator.py input/ --extension tif
+```
+
+Expected output:
+```
+============================================================
+VALIDATION SUMMARY
+============================================================
+Total files: 3
+Valid: 3
+Renamed: 3
+Errors: 0
+
+✓ All files validated successfully
+```
+
+### Programmatic Usage
+
+```python
+from pathlib import Path
+from file_validator import FileValidator
+
+# Initialize validator
+validator = FileValidator(dry_run=False)
+
+# Validate a list of files
+files = sorted(Path("input").glob("*.tif"))
+results = validator.validate_file_list(files, start_sequence=1)
+
+print(f"Valid: {results['valid']}/{results['total']}")
+print(f"Renamed: {results['renamed']}")
+
+# Verify sequential naming on the post-rename paths (the original `files` are stale once renamed)
+is_valid, error = FileValidator.verify_sequential_naming(results['validated_paths'])
+if not is_valid:
+    print(f"Error: {error}")
+
+# Verify matching triplets (TIFF + TXT + HTML)
+tiff_files = sorted(Path("package").glob("*.tif"))
+txt_files = sorted(Path("package").glob("*.txt"))
+html_files = sorted(Path("package").glob("*.html"))
+
+is_valid, error = FileValidator.verify_matching_triplets(
+    tiff_files, txt_files, html_files
+)
+if not is_valid:
+    print(f"Triplet mismatch: {error}")
+```
+
+### Run Tests
+```bash
+python3 test_file_validator.py -v
+```
+
+All 8 tests should pass ✓
+
+### Key Features
+- ✅ Validates 8-digit zero-padded format
+- ✅ Detects gaps in sequences
+- ✅ Renames files to standard format
+- ✅ Dry-run mode for safe testing
+- ✅ Verifies TIFF/TXT/HTML triplet matching
+- ✅ Handles case-insensitive extensions
+- ✅ Detailed error reporting
diff --git a/README.md b/README.md
index b9ad3f8..eecae9d 100644
--- a/README.md
+++ b/README.md
@@ -148,8 +148,31 @@ python3 ocr_processor.py input/ --language fra --output-dir /tmp/ocr
 python3 test_ocr_processor.py
 ```
 
+### ✅ Step 4: File Validation & Naming Convention
+- File validator module (`file_validator.py`)
+- 8-digit zero-padded sequential naming enforcement
+- Gap detection in sequences
+- Automatic file renaming to HathiTrust standard
+- TIFF/TXT/HTML triplet verification
+- Dry-run mode for safe testing
+- Test suite with 8 passing tests
+
+**Usage:**
+```bash
+# Verify files are properly named
+python3 file_validator.py temp/39015012345678 --verify-only
+
+# Validate and rename files (dry-run)
+python3 file_validator.py input/ --extension tif --dry-run
+
+# Actually rename files
+python3 file_validator.py input/ --extension tif
+
+# Run tests
+python3 test_file_validator.py
+```
+
 ### 🔄 Next Steps
-- Step 4: File Validation & Naming Convention
 - Step 5: YAML Metadata Generation
 - Step 6: MD5 Checksum Generation
 - Step 7: Package Assembly
diff --git a/file_validator.py b/file_validator.py
new file mode 100755
index 0000000..47eaf83
--- /dev/null
+++ b/file_validator.py
@@ -0,0 +1,365 @@
+#!/usr/bin/env python3
+"""
+File Validation & Naming Convention
+Ensures all files follow HathiTrust 8-digit sequential naming standard
+"""
+
+import logging
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+
+@dataclass
+class FileValidationResult:
+    """Outcome of validating (and possibly renaming) a single file"""
+    original_path: Path  # path the file had when validation started
+    validated_path: Path  # path after validation; equals original_path unless renamed
+    sequence_number: int  # sequence number the file was checked against
+    renamed: bool  # True when a rename was performed (or only logged, in dry-run mode)
+    valid: bool  # False when validation raised an exception
+    error: Optional[str] = None  # stringified exception when valid is False
+
+
+class FileValidator:
+    """Validates and standardizes filenames for HathiTrust compliance"""
+    
+    # HathiTrust requires 8-digit sequential naming
+    SEQUENCE_PATTERN = re.compile(r'^(\d{8})\.(tif|txt|html|xml)$', re.IGNORECASE)
+    EXPECTED_DIGITS = 8
+    
+    def __init__(self, dry_run: bool = False) -> None:
+        """
+        Create a validator, optionally in dry-run mode
+
+        Args:
+            dry_run: If True, rename_file() only logs the rename it would perform
+        """
+        self.dry_run = dry_run  # read by rename_file() before it touches the disk
+    
+    @staticmethod
+    def format_sequence_number(num: int) -> str:
+        """
+        Render a page sequence number as an 8-character, zero-padded string
+
+        Args:
+            num: Sequence number, expected to lie in 1..99999999
+        Returns:
+            Left-zero-padded string such as "00000001"
+        Raises:
+            ValueError: If num is below 1 or above 99999999
+        """
+        if num > 99999999 or num < 1:
+            raise ValueError(f"Sequence number {num} out of valid range (1-99999999)")
+        return str(num).rjust(8, "0")
+    
+    @staticmethod
+    def extract_sequence_from_filename(filename: str) -> Optional[int]:
+        """
+        Parse the numeric sequence out of a HathiTrust-style filename
+
+        Args:
+            filename: Bare filename (no directory part), e.g. "00000001.tif"
+
+        Returns:
+            The eight leading digits as an integer, or None when the name
+            does not match SEQUENCE_PATTERN
+        """
+        found = FileValidator.SEQUENCE_PATTERN.match(filename)
+        digits = found.group(1) if found else None
+        return None if digits is None else int(digits)
+    
+    @staticmethod
+    def is_valid_filename(filename: str) -> bool:
+        """
+        Report whether a filename matches the HathiTrust naming pattern
+
+        Args:
+            filename: Bare filename to test against SEQUENCE_PATTERN
+
+        Returns:
+            True when the pattern matches, otherwise False
+        """
+        return bool(FileValidator.SEQUENCE_PATTERN.match(filename))
+    
+    def generate_expected_filename(self, sequence: int, extension: str) -> str:
+        """
+        Build the standard filename for a sequence number and extension
+
+        Args:
+            sequence: Sequence number (1-based)
+            extension: File extension, with or without the leading dot
+        Returns:
+            Expected filename (e.g., "00000001.tif")
+        Raises:
+            ValueError: If sequence is out of range or extension is empty
+        """
+        # An empty suffix would otherwise yield a dangling-dot name ("00000001.")
+        if extension in ("", "."):
+            raise ValueError("File extension is required to build a filename")
+        dotted = extension if extension.startswith('.') else f'.{extension}'
+        return f"{self.format_sequence_number(sequence)}{dotted}"
+    
+    def rename_file(self, file_path: Path, new_filename: str) -> Path:
+        """
+        Give a file a new name inside its current directory
+
+        Args:
+            file_path: Existing location of the file
+            new_filename: Target filename (joined onto file_path's parent)
+
+        Returns:
+            The target path; in dry-run mode nothing on disk is touched
+
+        Raises:
+            FileExistsError: If a different path already occupies the target
+        """
+        target = file_path.parent / new_filename
+        if not self.dry_run:
+            occupied = target.exists() and target != file_path
+            if occupied:
+                raise FileExistsError(f"Cannot rename: {new_filename} already exists")
+            file_path.rename(target)
+            logging.info(f"Renamed: {file_path.name} → {new_filename}")
+        else:
+            logging.info(f"[DRY RUN] Would rename: {file_path.name} → {new_filename}")
+        return target
+    
+    def validate_single_file(self, file_path: Path, expected_sequence: int) -> FileValidationResult:
+        """
+        Check one file against its expected name, renaming it when they differ
+
+        In dry-run mode the rename is only logged, yet the result still carries
+        renamed=True and the path the file would have received.
+
+        Args:
+            file_path: File to check
+            expected_sequence: Sequence number this file should carry
+
+        Returns:
+            FileValidationResult; any exception is captured in its error field
+        """
+        outcome = FileValidationResult(
+            original_path=file_path,
+            validated_path=file_path,
+            sequence_number=expected_sequence,
+            renamed=False,
+            valid=False
+        )
+
+        try:
+            actual = file_path.name
+            # The file keeps its own suffix; only the stem is standardized
+            wanted = self.generate_expected_filename(expected_sequence, file_path.suffix)
+
+            if actual != wanted:
+                logging.info(f"Standardizing: {actual} → {wanted}")
+                new_location = self.rename_file(file_path, wanted)
+                outcome.validated_path = new_location
+                outcome.renamed = True
+            else:
+                logging.debug(f"✓ Valid: {actual}")
+
+            # Both branches end with the file at (or destined for) its standard name
+            outcome.valid = True
+
+        except Exception as e:
+            # Report the failure through the result instead of propagating it
+            outcome.valid = False
+            outcome.error = str(e)
+            logging.error(f"Validation failed for {file_path.name}: {e}")
+
+        return outcome
+    
+    def validate_file_list(self, files: List[Path], start_sequence: int = 1) -> Dict[str, Any]:
+        """
+        Validate and standardize a list of files
+
+        Args:
+            files: List of file paths to validate (should be pre-sorted)
+            start_sequence: Starting sequence number (default: 1)
+
+        Returns:
+            Dictionary with keys 'files' (every FileValidationResult),
+            'validated_paths' (paths of valid files), 'total', 'renamed',
+            'valid' (counts) and 'errors' (dicts with 'file', 'sequence', 'error')
+        """
+        logging.info(f"Validating {len(files)} files starting at sequence {start_sequence}")
+
+        results = {
+            'files': [],
+            'validated_paths': [],
+            'total': len(files),
+            'renamed': 0,
+            'valid': 0,
+            'errors': []
+        }
+
+        # Sequence numbers follow list order, so the caller's sort decides page order
+        for i, file_path in enumerate(files, start=start_sequence):
+            result = self.validate_single_file(file_path, i)
+            results['files'].append(result)
+
+            if not result.valid:
+                failure = {'file': file_path, 'sequence': i, 'error': result.error}
+                results['errors'].append(failure)
+                continue
+            results['validated_paths'].append(result.validated_path)
+            results['valid'] += 1
+            if result.renamed:
+                results['renamed'] += 1
+
+        logging.info(f"Validation complete: {results['valid']}/{results['total']} valid, "
+                    f"{results['renamed']} renamed")
+
+        if results['errors']:
+            logging.warning(f"{len(results['errors'])} validation errors")
+
+        return results
+    
+    @staticmethod
+    def verify_sequential_naming(files: List[Path]) -> Tuple[bool, Optional[str]]:
+        """
+        Confirm that filenames run 00000001, 00000002, ... without a break
+
+        Args:
+            files: File paths in the order to check (not re-sorted here)
+
+        Returns:
+            (True, None) on success, otherwise (False, description of the
+            first problem encountered)
+        """
+        if not files:
+            return False, "No files provided"
+
+        numbers = []
+        for entry in files:
+            parsed = FileValidator.extract_sequence_from_filename(entry.name)
+            if parsed is None:
+                return False, f"Invalid filename format: {entry.name}"
+            numbers.append(parsed)
+
+        # The run must open at 1 ...
+        if numbers[0] != 1:
+            return False, f"First file should be 00000001, found {numbers[0]:08d}"
+
+        # ... and every neighbouring pair must differ by exactly one
+        for lower, upper in zip(numbers, numbers[1:]):
+            if upper - lower != 1:
+                return False, f"Gap in sequence: {lower:08d} → {upper:08d}"
+        return True, None
+    
+    @staticmethod
+    def verify_matching_triplets(tiff_files: List[Path], txt_files: List[Path],
+                                 html_files: List[Path]) -> Tuple[bool, Optional[str]]:
+        """
+        Verify that TIFF, TXT, and HTML files have matching sequences
+
+        Args:
+            tiff_files: List of TIFF file paths
+            txt_files: List of TXT file paths
+            html_files: List of HTML file paths
+
+        Returns:
+            Tuple of (is_valid, error_message); a filename that does not
+            follow the naming convention is reported as an error
+        """
+        groups = {'TIFF': tiff_files, 'TXT': txt_files, 'HTML': html_files}
+        seqs = {}
+        for label, paths in groups.items():
+            numbers = [FileValidator.extract_sequence_from_filename(p.name) for p in paths]
+            # A non-conforming name has no sequence; fail rather than ignore the file
+            bad = [p.name for p, n in zip(paths, numbers) if n is None]
+            if bad:
+                return False, f"Invalid {label} filenames: {sorted(bad)}"
+            seqs[label] = set(numbers)
+        tiff_seqs, txt_seqs, html_seqs = seqs['TIFF'], seqs['TXT'], seqs['HTML']
+
+        # Check counts match
+        if len(tiff_seqs) != len(txt_seqs) or len(tiff_seqs) != len(html_seqs):
+            return False, f"Mismatch in file counts: {len(tiff_seqs)} TIFF, {len(txt_seqs)} TXT, {len(html_seqs)} HTML"
+
+        # Equal sizes but different members: name the TIFF pages lacking a partner
+        errors = []
+        if tiff_seqs - txt_seqs:
+            errors.append(f"Missing TXT files for: {sorted(tiff_seqs - txt_seqs)}")
+        if tiff_seqs - html_seqs:
+            errors.append(f"Missing HTML files for: {sorted(tiff_seqs - html_seqs)}")
+        if errors:
+            return False, "; ".join(errors)
+
+        return True, None
+
+
+# Demo/Testing functionality
+if __name__ == "__main__":
+    import argparse
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(levelname)s - %(message)s'
+    )
+
+    parser = argparse.ArgumentParser(description='Validate and standardize file naming')
+    parser.add_argument('directory', help='Directory containing files to validate')
+    parser.add_argument('--extension', default='tif',
+                       help='File extension to validate (default: tif)')
+    parser.add_argument('--dry-run', action='store_true',
+                       help='Show what would be renamed without actually renaming')
+    parser.add_argument('--verify-only', action='store_true',
+                       help='Only verify naming, do not rename')
+
+    args = parser.parse_args()
+
+    try:
+        directory = Path(args.directory)
+        # is_dir() also rejects a path that exists but is not a directory
+        if not directory.is_dir():
+            logging.error(f"Directory not found: {directory}")
+            raise SystemExit(1)
+        # Compare suffixes case-insensitively; a "*.tif" glob skips "X.TIF" on Linux
+        suffix = f".{args.extension.lstrip('.').lower()}"
+        matches = [p for p in directory.iterdir() if p.suffix.lower() == suffix]
+        # Natural order: digit runs compare as numbers, so "2.tif" precedes "10.tif"
+        files = sorted(matches, key=lambda p: (
+            [int(t) if i % 2 else t for i, t in enumerate(re.split(r'(\d+)', p.name))],
+            p.name))
+        if not files:
+            logging.warning(f"No .{args.extension} files found in {directory}")
+            raise SystemExit(0)
+        logging.info(f"Found {len(files)} .{args.extension} files")
+
+        # Verify only mode
+        if args.verify_only:
+            is_valid, error = FileValidator.verify_sequential_naming(files)
+            if is_valid:
+                print("✓ All files are properly named and sequential")
+            else:
+                print(f"✗ Validation failed: {error}")
+                raise SystemExit(1)
+        else:
+            # Validate and standardize
+            validator = FileValidator(dry_run=args.dry_run)
+            results = validator.validate_file_list(files)
+            # Print summary
+            print(f"\n{'='*60}")
+            print("VALIDATION SUMMARY")
+            print(f"{'='*60}")
+            print(f"Total files: {results['total']}")
+            print(f"Valid: {results['valid']}")
+            print(f"Renamed: {results['renamed']}")
+            print(f"Errors: {len(results['errors'])}")
+            if results['errors']:
+                print(f"\n❌ Errors:")
+                for error in results['errors']:
+                    print(f"  {error['file'].name}: {error['error']}")
+            if results['valid'] == results['total']:
+                print(f"\n✓ All files validated successfully")
+            else:
+                print(f"\n⚠ Some files failed validation")
+                raise SystemExit(1)
+
+    except Exception as e:
+        logging.error(f"Error: {e}")
+        raise SystemExit(1)
diff --git a/test_file_validator.py b/test_file_validator.py
new file mode 100644
index 0000000..8aff976
--- /dev/null
+++ b/test_file_validator.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+"""
+Unit tests for file_validator module
+"""
+
+import unittest
+import tempfile
+import shutil
+from pathlib import Path
+from file_validator import FileValidator, FileValidationResult
+
+
+class TestFileValidator(unittest.TestCase):
+    
+    @classmethod
+    def setUpClass(cls):
+        """Create a scratch directory holding one empty file per naming case"""
+        cls.temp_dir = Path(tempfile.mkdtemp())
+
+        # Fixture key -> filename; covers compliant and non-compliant names
+        fixture_names = {
+            'valid': '00000001.tif',
+            'valid_upper': '00000002.TIF',
+            'wrong_digits': '123.tif',
+            'no_leading_zeros': '3.tif',
+        }
+        cls.test_files = {key: cls.temp_dir / name for key, name in fixture_names.items()}
+        for fixture in cls.test_files.values():
+            fixture.touch()
+    
+    @classmethod
+    def tearDownClass(cls):
+        """Remove the temporary directory created in setUpClass, if it still exists"""
+        if cls.temp_dir.exists():
+            shutil.rmtree(cls.temp_dir)
+    
+    def test_format_sequence_number(self):
+        """Sequence numbers are zero-padded to eight digits; out-of-range values raise"""
+        fmt = FileValidator.format_sequence_number
+        # In-range values, including the largest one that still fits eight digits
+        expected = {1: "00000001", 23: "00000023", 456: "00000456", 99999999: "99999999"}
+        for number, text in expected.items():
+            self.assertEqual(fmt(number), text)
+
+        # One below and one above the accepted range
+        for rejected in (0, 100000000):
+            with self.assertRaises(ValueError):
+                fmt(rejected)
+    
+    def test_extract_sequence_from_filename(self):
+        """Compliant names yield their number; anything else yields None"""
+        extract = FileValidator.extract_sequence_from_filename
+        good = {"00000001.tif": 1, "00000023.txt": 23, "00000456.html": 456, "99999999.TIF": 99999999}
+        for name, number in good.items():
+            self.assertEqual(extract(name), number)
+
+        # Too few digits, and a prefix in front of the digits
+        for name in ("123.tif", "test_00000001.tif"):
+            self.assertIsNone(extract(name))
+    
+    def test_is_valid_filename(self):
+        """Only eight digits plus a known extension (any case) count as valid"""
+        check = FileValidator.is_valid_filename
+        # The last accepted name exercises case-insensitive extensions
+        for name in ("00000001.tif", "00000023.txt", "00000456.html", "00000001.TIF"):
+            self.assertTrue(check(name))
+
+        # The last rejected name has no extension at all
+        for name in ("123.tif", "test_00000001.tif", "00000001"):
+            self.assertFalse(check(name))
+    
+    def test_generate_expected_filename(self):
+        """Extensions are accepted both with and without a leading dot"""
+        build = FileValidator().generate_expected_filename
+
+        cases = [(1, ".tif", "00000001.tif"), (23, "txt", "00000023.txt"), (456, ".html", "00000456.html")]
+        for sequence, extension, expected in cases:
+            self.assertEqual(build(sequence, extension), expected)
+    
+    def test_validate_single_file_valid(self):
+        """An already-compliant file is reported valid and left unrenamed"""
+        compliant = self.test_files['valid']
+        outcome = FileValidator(dry_run=True).validate_single_file(compliant, 1)
+
+        self.assertEqual(outcome.sequence_number, 1)
+        self.assertFalse(outcome.renamed)
+        self.assertTrue(outcome.valid)
+    
+    def test_verify_sequential_naming(self):
+        """Files numbered 1..3 with no break pass verification"""
+        # Work in a dedicated sub-directory of the class-level scratch dir
+        workdir = self.temp_dir / "sequential_test"
+        workdir.mkdir(exist_ok=True)
+
+        # Three consecutive, correctly padded names
+        files = [workdir / f"{number:08d}.tif" for number in range(1, 4)]
+        for path in files:
+            path.touch()
+
+        ok, message = FileValidator.verify_sequential_naming(files)
+        self.assertTrue(ok)
+        self.assertIsNone(message)
+
+        # Remove this test's files (skipped if an assertion above fails;
+        # tearDownClass still removes the parent directory)
+        shutil.rmtree(workdir)
+    
+    def test_verify_sequential_naming_with_gap(self):
+        """A missing number in the middle of the run is reported as a gap"""
+        # Work in a dedicated sub-directory of the class-level scratch dir
+        workdir = self.temp_dir / "gap_test"
+        workdir.mkdir(exist_ok=True)
+
+        # Numbers 1, 2 and 4 exist; 3 is deliberately absent
+        files = [workdir / f"{number:08d}.tif" for number in (1, 2, 4)]
+        for path in files:
+            path.touch()
+
+        ok, message = FileValidator.verify_sequential_naming(files)
+        self.assertFalse(ok)
+        self.assertIn("Gap in sequence", message)
+
+        # Remove this test's files (skipped if an assertion above fails;
+        # tearDownClass still removes the parent directory)
+        shutil.rmtree(workdir)
+    
+    def test_verify_matching_triplets(self):
+        """Three pages, each with a TIFF, TXT and HTML file, verify cleanly"""
+        # Work in a dedicated sub-directory of the class-level scratch dir
+        workdir = self.temp_dir / "triplet_test"
+        workdir.mkdir(exist_ok=True)
+
+        # One list of paths per extension, pages 1..3 in each
+        by_extension = {
+            extension: [workdir / f"{number:08d}.{extension}" for number in range(1, 4)]
+            for extension in ("tif", "txt", "html")
+        }
+
+        # Create every file on disk; empty content is enough for name checks
+        for paths in by_extension.values():
+            for path in paths:
+                path.touch()
+
+        # Hand each extension's list to the verifier separately
+        tiff_files = by_extension["tif"]
+        txt_files = by_extension["txt"]
+        html_files = by_extension["html"]
+
+        ok, message = FileValidator.verify_matching_triplets(
+            tiff_files, txt_files, html_files
+        )
+        self.assertTrue(ok)
+        self.assertIsNone(message)
+
+        # Remove this test's files (skipped if an assertion above fails;
+        # tearDownClass still removes the parent directory)
+        shutil.rmtree(workdir)
+
+
+if __name__ == "__main__":
+    unittest.main()