import pandas as pd
import os
import cv2  # OpenCV for image handling
import re
import numpy as np 

# ============================================================================
# 🚀 FEATURE ENGINEERING FUNCTION (UPDATED FOR ROBUSTNESS)
# ============================================================================
# Coordinate pair searched for in the first '__'-delimited segment, written as
# two digits, '-', two digits, '_', two digits, '-', three digits.
_COORD_PATTERN = re.compile(r'(\d{2}-\d{2})_(\d{2}-\d{3})')
# Two compact timestamps joined by '_': YYYYMMDDTHHMMSS_YYYYMMDDTHHMMSS.
_TIME_PATTERN = re.compile(r'(\d{8}T\d{6})_(\d{8}T\d{6})')


def extract_features_from_filename(filename):
    """
    Extract structured features from an image filename.

    The filename is split on the '__' delimiter. The first segment is searched
    for a coordinate pair; the text in front of that pair becomes the station
    name (underscores turned into spaces, surrounding whitespace stripped).
    The second segment, when present, is taken verbatim as the source/sensor.
    A start/end timestamp pair is searched for anywhere in the filename.

    Args:
        filename: The filename to parse. Anything that is not a ``str``
            (e.g. ``None`` or a NaN cell from a CSV) yields an all-``None``
            result instead of raising.

    Returns:
        pd.Series indexed by 'Station_Name', 'Source_Sensor', 'Latitude',
        'Longitude', 'Start_Time' and 'End_Time'. Every value is the matched
        substring, or ``None`` when that piece could not be found.
    """
    features = {
        'Station_Name': None,
        'Source_Sensor': None,
        'Latitude': None,
        'Longitude': None,
        'Start_Time': None,
        'End_Time': None
    }

    # Missing CSV cells arrive as None/NaN; there is nothing to parse in them.
    if not isinstance(filename, str):
        return pd.Series(features)

    # Use the strongest delimiter '__' for splitting. str.split always returns
    # at least one element, so parts[0] is safe without a guard.
    parts = filename.split('__')
    main_segment = parts[0]

    # --- 1. Station Name, Coordinates, and Sensor ---
    coord_match = _COORD_PATTERN.search(main_segment)
    if coord_match:
        # NOTE(review): the first group is stored as Longitude and the second
        # as Latitude -- confirm this order against the naming convention.
        features['Longitude'] = coord_match.group(1)
        features['Latitude'] = coord_match.group(2)

        # Station name: everything in front of the coordinates. This assumes
        # the coordinates are the first numeric part of the segment.
        station_part = main_segment[:coord_match.start(1)]
        features['Station_Name'] = station_part.replace('_', ' ').strip()

    # The sensor does not depend on the coordinates having matched.
    if len(parts) > 1:
        features['Source_Sensor'] = parts[1]

    # --- 2. Time/Timestamp Extraction ---
    time_match = _TIME_PATTERN.search(filename)
    if time_match:
        features['Start_Time'] = time_match.group(1)
        features['End_Time'] = time_match.group(2)

    return pd.Series(features)


# ============================================================================
# 💡 MAIN PIPELINE
# ============================================================================

# Define your file paths
csv_path = 'data/data/train.csv'
image_folder_path = './data/data/' # <-- UPDATE THIS PATH!

# Load the metadata. Without it nothing else in the script can run, so a
# missing file ends the process.
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print("Error: train.csv not found. Check the path.")
    # `exit()` is a helper installed by the `site` module for interactive use
    # and would report success (status 0). Raise SystemExit with a non-zero
    # status so callers and shells can see that the run failed.
    raise SystemExit(1) from None
else:
    print("Metadata loaded successfully.")


def check_image_paths(df, image_folder_path):
    """Drop rows of ``df`` whose image file is not present in the folder.

    Args:
        df: DataFrame with an 'img_name' column holding bare file names.
        image_folder_path: Directory that is listed (non-recursively) to find
            the available files.

    Returns:
        ``df`` itself when every listed image exists; otherwise an independent
        copy of ``df`` containing only the rows whose image was found.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be
            listed (e.g. it does not exist).
    """
    print("\n--- Running Image Path Check ---")

    available_files = set(os.listdir(image_folder_path))
    missing_files = set(df['img_name']) - available_files

    if not missing_files:
        print("✅ All required images were found in the directory.")
        return df

    # Sort (by string form, so a stray NaN cannot break the comparison) to make
    # the logged examples reproducible from run to run.
    examples = sorted(missing_files, key=str)[:5]
    print(f"🚨 WARNING: {len(missing_files)} images are missing! Examples: {examples}")

    # .copy() detaches the filtered frame from the original so that adding
    # columns to it later does not raise SettingWithCopyWarning.
    df = df[~df['img_name'].isin(missing_files)].copy()
    print(f"Cleaned DataFrame size after filtering missing images: {len(df)}")
    return df

# NOTE: the image-path check is deliberately not run here. It runs once as
# step 1 of the execution flow at the bottom of this script; calling it at
# this point as well only repeated the directory scan and its log output.


def feature_engineer(df):
    """Add the filename-derived feature columns to ``df``.

    Each value of the 'img_name' column is parsed with
    ``extract_features_from_filename`` and the six resulting fields are
    written to ``df`` as columns (added, or overwritten if already present).

    Args:
        df: DataFrame with an 'img_name' column. It is modified in place.

    Returns:
        The same ``df`` object, now carrying the columns 'Station_Name',
        'Source_Sensor', 'Latitude', 'Longitude', 'Start_Time' and 'End_Time'.
    """
    print("\n--- Performing Feature Engineering on Filenames ---")

    feature_columns = [
        'Station_Name', 'Source_Sensor', 'Latitude',
        'Longitude', 'Start_Time', 'End_Time',
    ]

    # Build the feature table explicitly instead of relying on Series.apply to
    # expand the returned Series into columns: on an empty frame apply()
    # returns an empty Series and the multi-column assignment then fails with
    # "Columns must be same length as key".
    extracted = pd.DataFrame(
        [extract_features_from_filename(name) for name in df['img_name']],
        index=df.index,
        columns=feature_columns,
    )
    df[feature_columns] = extracted

    print("✅ Feature Engineering Complete. New columns added.")
    return df


def load_image_data(df, image_folder_path):
    """Read every image listed in ``df`` and pair it with its row metadata.

    Rows whose image cannot be read (``cv2.imread`` returns ``None``) or whose
    array has size 0 are skipped with a printed message. A row whose metadata
    cannot be collected is reported and skipped as well.

    Args:
        df: DataFrame with an 'img_name' column plus the pollutant columns
            ('ozone', 'NO2', 'AOT', 'elevation') and the engineered filename
            columns.
        image_folder_path: Directory the image names are resolved against.

    Returns:
        A list of dicts, one per loaded row, each with the keys 'image' (the
        array returned by ``cv2.imread``) and 'metadata' (a dict of the row's
        pollutant and engineered values).
    """
    # Pollutant columns first, then the engineered filename features.
    metadata_columns = (
        'ozone',
        'NO2',
        'AOT',
        'elevation',
        'Station_Name',
        'Source_Sensor',
        'Latitude',
        'Longitude',
        'Start_Time',
        'End_Time',
    )
    samples = []

    print("\n--- Loading Images and Features (This may take time) ---")

    for index, record in df.iterrows():
        full_path = os.path.join(image_folder_path, record['img_name'])
        pixels = cv2.imread(full_path)

        # Guard clauses: an unreadable or empty image contributes nothing.
        if pixels is None:
            print(f"🛑 SKIPPING row {index}: Critical load failure (None). Check file integrity at {full_path}")
            continue
        if pixels.size == 0:
            print(f"🛑 SKIPPING row {index}: Image loaded, but is empty (Size 0). Check file metadata at {full_path}")
            continue

        try:
            metadata = {column: record[column] for column in metadata_columns}
            samples.append({'image': pixels, 'metadata': metadata})
        except Exception as e:
            print(f"An error occurred processing row {index}: {e}")

    print("\n✅ Data loading complete.")
    return samples

# ============================================================================
# 🏁 EXECUTION FLOW
# ============================================================================

# Steps 1 and 2: drop the rows whose image file is missing, then derive the
# filename-based feature columns for the rows that remain.
df = feature_engineer(check_image_paths(df, image_folder_path))

# Step 3: read each image and pair it with its metadata.
combined_dataset = load_image_data(df, image_folder_path)

print("\n--- Pipeline Finished ---")
print(f"Successfully processed {len(combined_dataset)} data points.")