From 9f0cf7630436c86df9be779a8563fbe3789651a3 Mon Sep 17 00:00:00 2001 From: schipp0 Date: Tue, 30 Sep 2025 17:37:53 +0000 Subject: [PATCH] Add Step 4: File Validation & Naming Convention Implements HathiTrust's 8-digit sequential naming standard and file validation to ensure compliance before package assembly. New components: - file_validator.py: Core validation and standardization module * FileValidator class with dry-run support * format_sequence_number(): Converts to 8-digit zero-padded format * validate_single_file(): Validates and renames individual files * validate_file_list(): Batch validation with statistics * verify_sequential_naming(): Detects gaps in sequences * verify_matching_triplets(): Ensures TIFF/TXT/HTML sets match - test_file_validator.py: Comprehensive test suite (8 tests) * Tests formatting, extraction, validation, gap detection * Tests triplet matching for complete file sets * All tests passing - DEMO_step4.md: Usage examples and documentation Features: - Enforces 8-digit zero-padded sequential naming (00000001.tif) - Detects and reports gaps in file sequences - Automatic file renaming to HathiTrust standard - Dry-run mode for safe preview before changes - Verify-only mode for validation without modifications - Case-insensitive extension handling - Detailed error reporting with FileValidationResult dataclass CLI usage: python3 file_validator.py DIRECTORY [--extension tif] [--dry-run] [--verify-only] Updated README.md with Step 4 documentation. 
Progress: Steps 1-4 complete (40% of pipeline) --- DEMO_step4.md | 96 +++++++++++ README.md | 25 ++- file_validator.py | 365 +++++++++++++++++++++++++++++++++++++++++ test_file_validator.py | 161 ++++++++++++++++++ 4 files changed, 646 insertions(+), 1 deletion(-) create mode 100644 DEMO_step4.md create mode 100755 file_validator.py create mode 100644 test_file_validator.py diff --git a/DEMO_step4.md b/DEMO_step4.md new file mode 100644 index 0000000..7376d01 --- /dev/null +++ b/DEMO_step4.md @@ -0,0 +1,96 @@ +## Step 4: File Validation & Naming Convention - DEMO + +### Purpose +Ensures all files follow HathiTrust's strict 8-digit sequential naming convention: +- Format: `00000001.tif`, `00000001.txt`, `00000001.html` +- Sequential: No gaps allowed (1, 2, 3... not 1, 2, 4) +- Zero-padded: Always 8 digits + +### Test the Validator + +#### 1. Verify properly named files: +```bash +cd /home/schipp0/Digitization/HathiTrust + +# Check if files are properly named (no changes) +python3 file_validator.py temp/39015012345678 --verify-only +``` + +Expected output: +``` +✓ All files are properly named and sequential +``` + +#### 2. Validate and standardize files (dry run): +```bash +# See what would be renamed without actually renaming +python3 file_validator.py input/ --extension tif --dry-run +``` + +#### 3. 
Actually rename files to standard format: +```bash +# Rename files to match HathiTrust convention +python3 file_validator.py input/ --extension tif +``` + +Expected output: +``` +============================================================ +VALIDATION SUMMARY +============================================================ +Total files: 3 +Valid: 3 +Renamed: 3 +Errors: 0 + +✓ All files validated successfully +``` + +### Programmatic Usage + +```python +from pathlib import Path +from file_validator import FileValidator + +# Initialize validator +validator = FileValidator(dry_run=False) + +# Validate a list of files +files = sorted(Path("input").glob("*.tif")) +results = validator.validate_file_list(files, start_sequence=1) + +print(f"Valid: {results['valid']}/{results['total']}") +print(f"Renamed: {results['renamed']}") + +# Verify sequential naming +is_valid, error = FileValidator.verify_sequential_naming(files) +if not is_valid: + print(f"Error: {error}") + +# Verify matching triplets (TIFF + TXT + HTML) +tiff_files = sorted(Path("package").glob("*.tif")) +txt_files = sorted(Path("package").glob("*.txt")) +html_files = sorted(Path("package").glob("*.html")) + +is_valid, error = FileValidator.verify_matching_triplets( + tiff_files, txt_files, html_files +) +if not is_valid: + print(f"Triplet mismatch: {error}") +``` + +### Run Tests +```bash +python3 test_file_validator.py -v +``` + +All 8 tests should pass ✓ + +### Key Features +- ✅ Validates 8-digit zero-padded format +- ✅ Detects gaps in sequences +- ✅ Renames files to standard format +- ✅ Dry-run mode for safe testing +- ✅ Verifies TIFF/TXT/HTML triplet matching +- ✅ Handles case-insensitive extensions +- ✅ Detailed error reporting diff --git a/README.md b/README.md index b9ad3f8..eecae9d 100644 --- a/README.md +++ b/README.md @@ -148,8 +148,31 @@ python3 ocr_processor.py input/ --language fra --output-dir /tmp/ocr python3 test_ocr_processor.py ``` +### ✅ Step 4: File Validation & Naming Convention +- File 
validator module (`file_validator.py`) +- 8-digit zero-padded sequential naming enforcement +- Gap detection in sequences +- Automatic file renaming to HathiTrust standard +- TIFF/TXT/HTML triplet verification +- Dry-run mode for safe testing +- Test suite with 8 passing tests + +**Usage:** +```bash +# Verify files are properly named +python3 file_validator.py temp/39015012345678 --verify-only + +# Validate and rename files (dry-run) +python3 file_validator.py input/ --extension tif --dry-run + +# Actually rename files +python3 file_validator.py input/ --extension tif + +# Run tests +python3 test_file_validator.py +``` + ### 🔄 Next Steps -- Step 4: File Validation & Naming Convention - Step 5: YAML Metadata Generation - Step 6: MD5 Checksum Generation - Step 7: Package Assembly diff --git a/file_validator.py b/file_validator.py new file mode 100755 index 0000000..47eaf83 --- /dev/null +++ b/file_validator.py @@ -0,0 +1,365 @@ +#!/usr/bin/env python3 +""" +File Validation & Naming Convention +Ensures all files follow HathiTrust 8-digit sequential naming standard +""" + +import re +import logging +from pathlib import Path +from typing import List, Dict, Tuple, Optional +from dataclasses import dataclass + + +@dataclass +class FileValidationResult: + """Result of file validation and standardization""" + original_path: Path + validated_path: Path + sequence_number: int + renamed: bool + valid: bool + error: Optional[str] = None + + +class FileValidator: + """Validates and standardizes filenames for HathiTrust compliance""" + + # HathiTrust requires 8-digit sequential naming + SEQUENCE_PATTERN = re.compile(r'^(\d{8})\.(tif|txt|html|xml)$', re.IGNORECASE) + EXPECTED_DIGITS = 8 + + def __init__(self, dry_run: bool = False): + """ + Initialize file validator + + Args: + dry_run: If True, don't actually rename files, just report what would happen + """ + self.dry_run = dry_run + + @staticmethod + def format_sequence_number(num: int) -> str: + """ + Convert integer to 
8-digit string with zero padding + + Args: + num: Sequence number (1-based) + + Returns: + 8-digit zero-padded string (e.g., "00000001") + """ + if num < 1 or num > 99999999: + raise ValueError(f"Sequence number {num} out of valid range (1-99999999)") + + return str(num).zfill(8) + + @staticmethod + def extract_sequence_from_filename(filename: str) -> Optional[int]: + """ + Extract sequence number from a filename + + Args: + filename: Filename to parse + + Returns: + Sequence number as integer, or None if not found + """ + match = FileValidator.SEQUENCE_PATTERN.match(filename) + if match: + return int(match.group(1)) + return None + + @staticmethod + def is_valid_filename(filename: str) -> bool: + """ + Check if filename follows HathiTrust naming convention + + Args: + filename: Filename to validate + + Returns: + True if valid, False otherwise + """ + return FileValidator.SEQUENCE_PATTERN.match(filename) is not None + + def generate_expected_filename(self, sequence: int, extension: str) -> str: + """ + Generate the expected filename for a sequence number + + Args: + sequence: Sequence number (1-based) + extension: File extension (with or without dot) + + Returns: + Expected filename (e.g., "00000001.tif") + """ + # Normalize extension + if not extension.startswith('.'): + extension = f'.{extension}' + + sequence_str = self.format_sequence_number(sequence) + return f"{sequence_str}{extension}" + + def rename_file(self, file_path: Path, new_filename: str) -> Path: + """ + Rename a file to match HathiTrust naming convention + + Args: + file_path: Current file path + new_filename: New filename to use + + Returns: + Path to renamed file + """ + new_path = file_path.parent / new_filename + + if self.dry_run: + logging.info(f"[DRY RUN] Would rename: {file_path.name} → {new_filename}") + return new_path + + if new_path.exists() and new_path != file_path: + raise FileExistsError(f"Cannot rename: {new_filename} already exists") + + file_path.rename(new_path) + 
logging.info(f"Renamed: {file_path.name} → {new_filename}") + + return new_path + + def validate_single_file(self, file_path: Path, expected_sequence: int) -> FileValidationResult: + """ + Validate and optionally rename a single file + + Args: + file_path: Path to file to validate + expected_sequence: Expected sequence number for this file + + Returns: + FileValidationResult object + """ + result = FileValidationResult( + original_path=file_path, + validated_path=file_path, + sequence_number=expected_sequence, + renamed=False, + valid=False + ) + + try: + current_filename = file_path.name + extension = file_path.suffix + + # Generate expected filename + expected_filename = self.generate_expected_filename(expected_sequence, extension) + + # Check if current filename matches expected + if current_filename == expected_filename: + result.valid = True + result.validated_path = file_path + logging.debug(f"✓ Valid: {current_filename}") + else: + # Need to rename + logging.info(f"Standardizing: {current_filename} → {expected_filename}") + new_path = self.rename_file(file_path, expected_filename) + result.validated_path = new_path + result.renamed = True + result.valid = True + + except Exception as e: + result.valid = False + result.error = str(e) + logging.error(f"Validation failed for {file_path.name}: {e}") + + return result + + def validate_file_list(self, files: List[Path], start_sequence: int = 1) -> Dict[str, any]: + """ + Validate and standardize a list of files + + Args: + files: List of file paths to validate (should be pre-sorted) + start_sequence: Starting sequence number (default: 1) + + Returns: + Dictionary with validation results and statistics + """ + logging.info(f"Validating {len(files)} files starting at sequence {start_sequence}") + + results = { + 'files': [], + 'validated_paths': [], + 'total': len(files), + 'renamed': 0, + 'valid': 0, + 'errors': [] + } + + for i, file_path in enumerate(files, start=start_sequence): + result = 
self.validate_single_file(file_path, i) + results['files'].append(result) + + if result.valid: + results['validated_paths'].append(result.validated_path) + results['valid'] += 1 + if result.renamed: + results['renamed'] += 1 + else: + results['errors'].append({ + 'file': file_path, + 'sequence': i, + 'error': result.error + }) + + logging.info(f"Validation complete: {results['valid']}/{results['total']} valid, " + f"{results['renamed']} renamed") + + if results['errors']: + logging.warning(f"{len(results['errors'])} validation errors") + + return results + + @staticmethod + def verify_sequential_naming(files: List[Path]) -> Tuple[bool, Optional[str]]: + """ + Verify that files are sequentially numbered with no gaps + + Args: + files: List of file paths (assumed to be sorted) + + Returns: + Tuple of (is_valid, error_message) + """ + if not files: + return False, "No files provided" + + sequences = [] + for file_path in files: + seq = FileValidator.extract_sequence_from_filename(file_path.name) + if seq is None: + return False, f"Invalid filename format: {file_path.name}" + sequences.append(seq) + + # Check starts at 1 + if sequences[0] != 1: + return False, f"First file should be 00000001, found {sequences[0]:08d}" + + # Check for gaps + for i in range(len(sequences) - 1): + if sequences[i + 1] != sequences[i] + 1: + return False, f"Gap in sequence: {sequences[i]:08d} → {sequences[i+1]:08d}" + + return True, None + + @staticmethod + def verify_matching_triplets(tiff_files: List[Path], txt_files: List[Path], + html_files: List[Path]) -> Tuple[bool, Optional[str]]: + """ + Verify that TIFF, TXT, and HTML files have matching sequences + + Args: + tiff_files: List of TIFF file paths + txt_files: List of TXT file paths + html_files: List of HTML file paths + + Returns: + Tuple of (is_valid, error_message) + """ + # Extract sequence numbers + tiff_seqs = set(FileValidator.extract_sequence_from_filename(f.name) for f in tiff_files) + txt_seqs = 
set(FileValidator.extract_sequence_from_filename(f.name) for f in txt_files) + html_seqs = set(FileValidator.extract_sequence_from_filename(f.name) for f in html_files) + + # Remove None values + tiff_seqs.discard(None) + txt_seqs.discard(None) + html_seqs.discard(None) + + # Check counts match + if len(tiff_seqs) != len(txt_seqs) or len(tiff_seqs) != len(html_seqs): + return False, f"Mismatch in file counts: {len(tiff_seqs)} TIFF, {len(txt_seqs)} TXT, {len(html_seqs)} HTML" + + # Check all sequences match + if tiff_seqs != txt_seqs or tiff_seqs != html_seqs: + missing_txt = tiff_seqs - txt_seqs + missing_html = tiff_seqs - html_seqs + errors = [] + if missing_txt: + errors.append(f"Missing TXT files for: {sorted(missing_txt)}") + if missing_html: + errors.append(f"Missing HTML files for: {sorted(missing_html)}") + return False, "; ".join(errors) + + return True, None + + +# Demo/Testing functionality +if __name__ == "__main__": + import argparse + + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' + ) + + parser = argparse.ArgumentParser(description='Validate and standardize file naming') + parser.add_argument('directory', help='Directory containing files to validate') + parser.add_argument('--extension', default='tif', + help='File extension to validate (default: tif)') + parser.add_argument('--dry-run', action='store_true', + help='Show what would be renamed without actually renaming') + parser.add_argument('--verify-only', action='store_true', + help='Only verify naming, do not rename') + + args = parser.parse_args() + + try: + directory = Path(args.directory) + if not directory.exists(): + logging.error(f"Directory not found: {directory}") + exit(1) + + # Find files with specified extension + files = sorted(directory.glob(f"*.{args.extension}")) + + if not files: + logging.warning(f"No .{args.extension} files found in {directory}") + exit(0) + + logging.info(f"Found {len(files)} .{args.extension} files") + + # 
Verify only mode + if args.verify_only: + is_valid, error = FileValidator.verify_sequential_naming(files) + if is_valid: + print("✓ All files are properly named and sequential") + else: + print(f"✗ Validation failed: {error}") + exit(1) + else: + # Validate and standardize + validator = FileValidator(dry_run=args.dry_run) + results = validator.validate_file_list(files) + + # Print summary + print(f"\n{'='*60}") + print("VALIDATION SUMMARY") + print(f"{'='*60}") + print(f"Total files: {results['total']}") + print(f"Valid: {results['valid']}") + print(f"Renamed: {results['renamed']}") + print(f"Errors: {len(results['errors'])}") + + if results['errors']: + print(f"\n❌ Errors:") + for error in results['errors']: + print(f" {error['file'].name}: {error['error']}") + + if results['valid'] == results['total']: + print(f"\n✓ All files validated successfully") + else: + print(f"\n⚠ Some files failed validation") + exit(1) + + except Exception as e: + logging.error(f"Error: {e}") + exit(1) diff --git a/test_file_validator.py b/test_file_validator.py new file mode 100644 index 0000000..8aff976 --- /dev/null +++ b/test_file_validator.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python3 +""" +Unit tests for file_validator module +""" + +import unittest +import tempfile +import shutil +from pathlib import Path +from file_validator import FileValidator, FileValidationResult + + +class TestFileValidator(unittest.TestCase): + + @classmethod + def setUpClass(cls): + """Set up test fixtures""" + cls.temp_dir = Path(tempfile.mkdtemp()) + + # Create test files with various naming patterns + cls.test_files = { + 'valid': cls.temp_dir / '00000001.tif', + 'valid_upper': cls.temp_dir / '00000002.TIF', + 'wrong_digits': cls.temp_dir / '123.tif', + 'no_leading_zeros': cls.temp_dir / '3.tif', + } + + for file_path in cls.test_files.values(): + file_path.touch() + + @classmethod + def tearDownClass(cls): + """Clean up test directory""" + if cls.temp_dir.exists(): + shutil.rmtree(cls.temp_dir) + + def 
test_format_sequence_number(self): + """Test sequence number formatting""" + self.assertEqual(FileValidator.format_sequence_number(1), "00000001") + self.assertEqual(FileValidator.format_sequence_number(23), "00000023") + self.assertEqual(FileValidator.format_sequence_number(456), "00000456") + self.assertEqual(FileValidator.format_sequence_number(99999999), "99999999") + + # Test invalid ranges + with self.assertRaises(ValueError): + FileValidator.format_sequence_number(0) + with self.assertRaises(ValueError): + FileValidator.format_sequence_number(100000000) + + def test_extract_sequence_from_filename(self): + """Test sequence extraction from filenames""" + self.assertEqual(FileValidator.extract_sequence_from_filename("00000001.tif"), 1) + self.assertEqual(FileValidator.extract_sequence_from_filename("00000023.txt"), 23) + self.assertEqual(FileValidator.extract_sequence_from_filename("00000456.html"), 456) + self.assertEqual(FileValidator.extract_sequence_from_filename("99999999.TIF"), 99999999) + + # Invalid formats + self.assertIsNone(FileValidator.extract_sequence_from_filename("123.tif")) + self.assertIsNone(FileValidator.extract_sequence_from_filename("test_00000001.tif")) + + def test_is_valid_filename(self): + """Test filename validation""" + self.assertTrue(FileValidator.is_valid_filename("00000001.tif")) + self.assertTrue(FileValidator.is_valid_filename("00000023.txt")) + self.assertTrue(FileValidator.is_valid_filename("00000456.html")) + self.assertTrue(FileValidator.is_valid_filename("00000001.TIF")) # Case insensitive + + self.assertFalse(FileValidator.is_valid_filename("123.tif")) + self.assertFalse(FileValidator.is_valid_filename("test_00000001.tif")) + self.assertFalse(FileValidator.is_valid_filename("00000001")) # No extension + + def test_generate_expected_filename(self): + """Test expected filename generation""" + validator = FileValidator() + + self.assertEqual(validator.generate_expected_filename(1, ".tif"), "00000001.tif") + 
self.assertEqual(validator.generate_expected_filename(23, "txt"), "00000023.txt") + self.assertEqual(validator.generate_expected_filename(456, ".html"), "00000456.html") + + def test_validate_single_file_valid(self): + """Test validation of a properly named file""" + validator = FileValidator(dry_run=True) + result = validator.validate_single_file(self.test_files['valid'], 1) + + self.assertTrue(result.valid) + self.assertFalse(result.renamed) + self.assertEqual(result.sequence_number, 1) + + def test_verify_sequential_naming(self): + """Test sequential naming verification""" + # Create properly named files + test_dir = self.temp_dir / "sequential_test" + test_dir.mkdir(exist_ok=True) + + files = [] + for i in range(1, 4): + f = test_dir / f"{i:08d}.tif" + f.touch() + files.append(f) + + is_valid, error = FileValidator.verify_sequential_naming(files) + self.assertTrue(is_valid) + self.assertIsNone(error) + + # Clean up + shutil.rmtree(test_dir) + + def test_verify_sequential_naming_with_gap(self): + """Test detection of gaps in sequence""" + test_dir = self.temp_dir / "gap_test" + test_dir.mkdir(exist_ok=True) + + # Create files with gap (1, 2, 4 - missing 3) + files = [] + for i in [1, 2, 4]: + f = test_dir / f"{i:08d}.tif" + f.touch() + files.append(f) + + is_valid, error = FileValidator.verify_sequential_naming(files) + self.assertFalse(is_valid) + self.assertIn("Gap in sequence", error) + + # Clean up + shutil.rmtree(test_dir) + + def test_verify_matching_triplets(self): + """Test verification of matching TIFF/TXT/HTML sets""" + test_dir = self.temp_dir / "triplet_test" + test_dir.mkdir(exist_ok=True) + + # Create matching triplets + tiff_files = [] + txt_files = [] + html_files = [] + + for i in range(1, 4): + tiff = test_dir / f"{i:08d}.tif" + txt = test_dir / f"{i:08d}.txt" + html = test_dir / f"{i:08d}.html" + + tiff.touch() + txt.touch() + html.touch() + + tiff_files.append(tiff) + txt_files.append(txt) + html_files.append(html) + + is_valid, error = 
FileValidator.verify_matching_triplets( + tiff_files, txt_files, html_files + ) + self.assertTrue(is_valid) + self.assertIsNone(error) + + # Clean up + shutil.rmtree(test_dir) + + +if __name__ == "__main__": + unittest.main()