import pandas as pd
import os
import cv2  # OpenCV for image handling

# ----------------------------------------------
#  Task 1: Get data
#       - load the metadata from the CSV file
#       - validate that every image listed in the metadata exists in the image folder
#       - load each image (as a NumPy array) together with its data attributes
# ----------------------------------------------

# Input locations; both are relative paths, so they resolve against the
# current working directory.
csv_path = 'data/data/train.csv'
image_folder_path = './data/data/' # <-- UPDATE THIS PATH!

# Load the metadata. Only the read itself is guarded, so an unrelated failure
# is never mistaken for a missing file.
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print("Error: train.csv not found.")
    # The bare exit() helper is injected by the `site` module for interactive
    # sessions and is not guaranteed to exist (e.g. `python -S`); it also
    # reported success (status 0). Raise SystemExit directly with a non-zero
    # status so calling scripts can detect the failure.
    raise SystemExit(1)
else:
    print("Metadata loaded successfully.")


def check_image_paths(df, image_folder_path):
    """Drop the rows of ``df`` whose image is absent from ``image_folder_path``.

    Args:
        df: DataFrame with an ``img_name`` column holding file names.
        image_folder_path: Directory expected to contain those files.

    Returns:
        ``df`` itself when no name is missing; otherwise an independent copy
        holding only the rows whose ``img_name`` appears in the directory
        listing.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be
            listed (for example, it does not exist).
        KeyError: If ``df`` has no ``img_name`` column.
    """
    print("\n--- Running Image Path Check ---")

    # Every entry name actually present in the folder.
    available_files = set(os.listdir(image_folder_path))

    # Names are compared exactly as written in the CSV: no lower-casing or
    # other normalisation is applied, so the match is case-sensitive.
    required_names = set(df['img_name'])

    missing_files = required_names - available_files

    if not missing_files:
        print("✅ All required images were found in the directory.")
        return df

    # Sort so the reported examples are the same on every run (set order is
    # not stable); key=str keeps the sort from failing if a non-string value
    # such as NaN ended up among the names.
    examples = sorted(missing_files, key=str)[:5]
    print(f"🚨 WARNING: {len(missing_files)} images are missing! Examples: {examples}")

    # Keep only rows that have an image. The explicit copy detaches the result
    # from the caller's frame, so later column assignments on it do not raise
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    df = df[~df['img_name'].isin(missing_files)].copy()
    print(f"Cleaned DataFrame size: {len(df)}")

    return df

# ---> Run the existence check and keep only the rows whose image is on disk
df = check_image_paths(df=df, image_folder_path=image_folder_path)


def load_image_data(df, image_folder_path, *,
                    feature_columns=('ozone', 'NO2', 'AOT', 'elevation'),
                    image_reader=None):
    """Load every image listed in ``df`` and pair it with its feature values.

    Args:
        df: DataFrame with an ``img_name`` column plus every column named in
            ``feature_columns``.
        image_folder_path: Directory that each ``img_name`` is joined onto.
        feature_columns: Names of the columns copied into each record's
            ``metadata`` dict. Defaults to the four columns this script has
            always extracted.
        image_reader: Callable that takes a path and returns the decoded
            image, or ``None`` when the image cannot be read. Defaults to
            ``cv2.imread``.

    Returns:
        A list of dicts, one per successfully loaded row, each shaped as
        ``{'image': <decoded image>, 'metadata': {column: value, ...}}``.
        Rows whose image cannot be read, or that raise while being processed,
        are reported on stdout and left out of the list.

    Raises:
        KeyError: If ``df`` has no ``img_name`` column.
    """
    # Resolve the default here rather than in the signature so that callers
    # supplying their own reader never touch OpenCV.
    if image_reader is None:
        image_reader = cv2.imread

    processed_data = []

    print("\n--- Loading Images and Features (This may take time) ---")

    for index, row in df.iterrows():
        # A missing 'img_name' column is a schema problem, not a bad row, so
        # this lookup stays outside the per-row try and fails loudly.
        img_name = row['img_name']

        try:
            # Joined as given; the result is only absolute if
            # image_folder_path is. Kept inside the try so a non-string name
            # (e.g. a blank CSV cell) skips this row instead of aborting the
            # whole load.
            full_path = os.path.join(image_folder_path, img_name)

            image = image_reader(full_path)

            # The reader signals an unreadable image by returning None
            # instead of raising.
            if image is None:
                print(f"Skipping row {index}: Could not load image at {full_path}")
                continue

            # Extract the pollutant/feature metadata for this row.
            features = {column: row[column] for column in feature_columns}

            processed_data.append({
                'image': image,  # The decoded image returned by the reader
                'metadata': features
            })

        except Exception as e:
            # Deliberately best-effort: one bad row must not stop the load.
            print(f"An error occurred processing row {index}: {e}")

    print("✅ Data loading complete.")
    return processed_data

# ---> Load every image together with its metadata into one in-memory list
combined_dataset = load_image_data(df=df, image_folder_path=image_folder_path)