# preprocessing.py

# -*- coding: utf-8 -*-
"""preprocessing.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1zHp3b8TUG2thkU3npcljql5mBDNpk63H
"""

import numpy as np
import cv2
import os
from PIL import Image
import zipfile
import shutil

from google.colab import drive
# Mount Google Drive at /content/drive so files stored there are reachable by path.
drive.mount('/content/drive')

# Location of the train.zip archive on the mounted Drive, and the local
# folder it is extracted into (nothing is downloaded here).
zip_path = '/content/drive/My Drive/train.zip'
extract_to = '/content/train/'

# Create the extraction folder; exist_ok=True makes re-running this step harmless.
os.makedirs(extract_to, exist_ok=True)

# Extract every member of the archive into extract_to.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print("Files extracted successfully!")

# Haar cascade shared by every cropImg call; created lazily on first use so the
# XML model is parsed once instead of once per image.
_FACE_CASCADE = None


def _get_face_cascade():
    """Return the shared frontal-face Haar cascade, loading it on first use."""
    global _FACE_CASCADE
    if _FACE_CASCADE is None:
        _FACE_CASCADE = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        )
    return _FACE_CASCADE


def cropImg(imgPath):
    """
    Crop the image to only include the face detected using OpenCV's Haar Cascades.

    The image is read with PIL, converted to a BGR numpy array and searched for
    frontal faces. When at least one face is found, the region of the first
    detection returned by OpenCV is cropped out.

    Parameters:
        imgPath (str): The path to the input image file.

    Returns:
        np.array or None: Cropped face area as a numpy array in BGR format, the
        whole image (BGR) if no face is detected, or None if the file could not
        be opened/decoded (the error is printed).
    """
    try:
        # The context manager closes the underlying file as soon as the pixels
        # have been converted, so handles do not pile up over a large dataset.
        with Image.open(imgPath) as source_image:
            image = source_image.convert('RGB')
    except Exception as e:
        # Deliberately broad: loading is best-effort and the caller skips
        # any file for which None is returned.
        print(f"PIL error: {e}")
        return None

    # PIL yields RGB; reverse the channel axis to get the BGR layout OpenCV uses.
    # copy() gives OpenCV a contiguous array rather than a negative-stride view.
    open_cv_image = np.array(image)[:, :, ::-1].copy()

    gray = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)
    faces = _get_face_cascade().detectMultiScale(
        gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
    )

    if len(faces) == 0:
        return open_cv_image  # No face found: return the original image (BGR)

    x, y, w, h = faces[0]
    return open_cv_image[y:y+h, x:x+w]  # Cropped face region (BGR)

def normalize_data(data):
    """
    Turn a BGR image array into an RGB array scaled by 1/255.

    Parameters:
        data (np.array): Image array whose third axis holds the channels in BGR order.

    Returns:
        np.array: A new array with the channel axis reversed (RGB order) and every
        value divided by 255.0, so 0-255 inputs map onto the 0-1 range.
    """
    # Reversing the third axis swaps BGR -> RGB without touching the input array.
    rgb_pixels = data[:, :, ::-1]
    scaled = rgb_pixels / 255.0
    return scaled

def process_images(image_folder, output_folder):
    """
    Process each image in the specified folder: crop to face, normalize, and save to output folder.

    Only files ending in .png, .jpg or .jpeg (case-insensitive) are considered.
    Each one is cropped with cropImg, passed through normalize_data, converted
    back to an 8-bit BGR array and written under the same filename in
    output_folder. Files that cannot be loaded or written are reported and skipped.

    Parameters:
        image_folder (str): Folder containing the images to process.
        output_folder (str): Folder to save processed images (created if missing).
    """
    count = 0
    # Ensure the output directory exists
    os.makedirs(output_folder, exist_ok=True)

    # sorted() makes the processing order (and the progress output) reproducible.
    for filename in sorted(os.listdir(image_folder)):
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):  # Check for image files
            continue

        cropped_face = cropImg(os.path.join(image_folder, filename))
        if cropped_face is None:
            print(f"Skipping file due to loading error: {filename}")
            continue

        normalized_image = normalize_data(cropped_face)

        # normalize_data returns RGB in the 0-1 range, but cv2.imwrite interprets
        # arrays as BGR. Reverse the channels again and convert to uint8 so the
        # saved file has the same colours as the source image.
        bgr_pixels = np.ascontiguousarray(normalized_image[:, :, ::-1])
        output_image = np.clip(np.rint(bgr_pixels * 255.0), 0, 255).astype(np.uint8)

        output_path = os.path.join(output_folder, filename)
        if not cv2.imwrite(output_path, output_image):
            # imwrite signals failure through its return value, not an exception.
            print(f"Failed to write processed image: {output_path}")
            continue

        count += 1
        if count % 100 == 1:
            print(f"Processed {count} images.")

    print("Processing complete.")

# Input and output directories for preprocessing.
# image_folder is the 'train' subfolder of the /content/train/ extraction target
# (presumably the archive contains a top-level 'train' folder - verify against the zip);
# output_folder receives the processed images and is created by process_images if missing.
image_folder = '/content/train/train'
output_folder = '/content/processed'

# Crop, normalize and save every image from image_folder into output_folder.
process_images(image_folder, output_folder)

# Compress the processed train data into /content/processed.zip.
# shutil.make_archive replaces the notebook-only "!zip -r" shell escape, which is a
# syntax error when this file is run as a plain Python script. root_dir/base_dir
# are chosen so entries are stored under "content/processed/", the same layout
# that `zip -r /content/processed.zip /content/processed` produces.
shutil.make_archive('/content/processed', 'zip', root_dir='/', base_dir='content/processed')

# Move the archive to Google Drive so the preprocessing step does not have to be run again.
source = '/content/processed.zip'
destination= '/content/drive/MyDrive/processed.zip'
shutil.move(source, destination)