import pandas as pd
import os
import cv2 # OpenCV for image handling
+import re
+import numpy as np
-# ----------------------------------------------
-# Task 1: Get data
-# - load data from files
-# - validate the existence of all images loaded in folder
-# - creates numpy array to store data attributes
-# ----------------------------------------------
# ============================================================================
# 🚀 FEATURE ENGINEERING FUNCTION (UPDATED FOR ROBUSTNESS)
# ============================================================================
# Coordinate pair: two digits, '-', two digits, '_', two digits, '-', three
# digits (for example "50-22_08-618").
_COORD_PATTERN = re.compile(r'(\d{2}-\d{2})_(\d{2}-\d{3})')

# Two timestamps of the form <8 digits>T<6 digits> joined by an underscore
# (written as YYYYMMDDTHHMMSS in the original notes).
_TIME_PATTERN = re.compile(r'(\d{8}T\d{6})_(\d{8}T\d{6})')


def extract_features_from_filename(filename):
    """Extract station, sensor, coordinate and timestamp fields from a filename.

    The filename is split on the '__' delimiter. The first piece is searched
    for a coordinate pair; whatever precedes that pair becomes the station
    name, with underscores turned into spaces. The second piece, when there
    is one, is stored unchanged as the source/sensor. The whole filename is
    searched separately for a pair of timestamps.

    Args:
        filename: The image filename to parse. Values that are not strings
            (such as None or NaN) produce a result with every field None
            instead of raising.

    Returns:
        A pandas Series with the keys 'Station_Name', 'Source_Sensor',
        'Latitude', 'Longitude', 'Start_Time' and 'End_Time'. A field that
        could not be found is None.
    """
    features = {
        'Station_Name': None,
        'Source_Sensor': None,
        'Latitude': None,
        'Longitude': None,
        'Start_Time': None,
        'End_Time': None,
    }

    # Guard clause: str methods and regex searches below need a real string.
    if not isinstance(filename, str):
        return pd.Series(features)

    # str.split always returns at least one element, so parts[0] is safe.
    parts = filename.split('__')
    main_segment = parts[0]

    # --- 1. Station name and coordinates ---
    coord_match = _COORD_PATTERN.search(main_segment)
    if coord_match:
        # NOTE(review): the first group is stored as Longitude and the second
        # as Latitude, as in the original code - confirm this matches the
        # naming convention of the files.
        features['Longitude'] = coord_match.group(1)
        features['Latitude'] = coord_match.group(2)

        # Everything in front of the coordinates is taken as the station
        # name. An empty result is reported as None like any other miss.
        station_part = main_segment[:coord_match.start(1)]
        features['Station_Name'] = station_part.replace('_', ' ').strip() or None

    # --- 2. Source / sensor ---
    # NOTE(review): assumed to be independent of the coordinate match; the
    # original nesting of this step could not be determined - confirm.
    if len(parts) > 1:
        features['Source_Sensor'] = parts[1]

    # --- 3. Timestamps ---
    time_match = _TIME_PATTERN.search(filename)
    if time_match:
        features['Start_Time'] = time_match.group(1)
        features['End_Time'] = time_match.group(2)

    return pd.Series(features)
+
+
# ============================================================================
# 💡 MAIN PIPELINE (Rest of the code remains largely the same)
# ============================================================================
# Define your file paths
# NOTE(review): image_folder_path is used further down in this file but its
# definition is not visible in this section - confirm it is set before use.
csv_path = 'data/data/train.csv'

# Load the metadata table. Nothing downstream can run without it, so a
# missing file stops the script with a non-zero exit status.
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print("Error: train.csv not found. Check the path.")
    # SystemExit(1) instead of the interactive-only exit() helper, which
    # would also have reported success (status 0) to the calling shell.
    raise SystemExit(1) from None
else:
    print("Metadata loaded successfully.")
def check_image_paths(df, image_folder_path):
    """Checks if all required images exist in the target folder.

    The names in the 'img_name' column are compared with the entries that
    os.listdir reports for ``image_folder_path``. Rows whose image is not
    present are dropped and a warning with up to five example names is
    printed.

    Args:
        df: DataFrame with an 'img_name' column.
        image_folder_path: Directory that is expected to contain the images.

    Returns:
        ``df`` itself when every image is present, otherwise a filtered copy
        holding only the rows whose image was found.
    """
    print("\n--- Running Image Path Check ---")

    available_files = set(os.listdir(image_folder_path))
    required_names = set(df['img_name'])
    missing_files = required_names - available_files

    if not missing_files:
        print("✅ All required images were found in the directory.")
        return df

    print(f"🚨 WARNING: {len(missing_files)} images are missing! Examples: {list(missing_files)[:5]}")
    # Copy the filtered frame so that adding columns to the result later does
    # not trigger pandas' SettingWithCopyWarning.
    df = df[~df['img_name'].isin(missing_files)].copy()
    print(f"Cleaned DataFrame size after filtering missing images: {len(df)}")
    return df
def feature_engineer(df):
    """Applies feature extraction logic to the 'img_name' column.

    Each filename is passed to extract_features_from_filename and the
    resulting fields are written to ``df`` as new columns.

    Args:
        df: DataFrame with an 'img_name' column. It is modified in place.

    Returns:
        The same ``df`` object with the columns 'Station_Name',
        'Source_Sensor', 'Latitude', 'Longitude', 'Start_Time' and
        'End_Time' added.
    """
    print("\n--- Performing Feature Engineering on Filenames ---")

    feature_columns = [
        'Station_Name', 'Source_Sensor', 'Latitude',
        'Longitude', 'Start_Time', 'End_Time',
    ]

    # Build the table explicitly instead of relying on Series.apply: on an
    # empty column apply returns an empty Series rather than a DataFrame,
    # which cannot be assigned to six columns at once.
    extracted = pd.DataFrame(
        [dict(extract_features_from_filename(name)) for name in df['img_name']],
        index=df.index,
        columns=feature_columns,
    )

    # Assign by column name so a value can never land under the wrong label
    # if the extractor's key order changes.
    for column in feature_columns:
        df[column] = extracted[column].to_numpy()

    print("✅ Feature Engineering Complete. New columns added.")
    return df
+
+
def load_image_data(df, image_folder_path):
    """Loads images and returns a list of (image_data, feature_data) tuples.

    For every row the image named in 'img_name' is read with cv2.imread from
    ``image_folder_path``. Rows whose image cannot be read (imread returned
    None) or is empty (size 0) are reported and skipped, as are rows that
    lack one of the expected metadata columns.

    Args:
        df: DataFrame with an 'img_name' column plus the pollutant columns
            ('ozone', 'NO2', 'AOT', 'elevation') and the engineered columns
            ('Station_Name', 'Source_Sensor', 'Latitude', 'Longitude',
            'Start_Time', 'End_Time').
        image_folder_path: Directory containing the image files.

    Returns:
        A list with one ``(image, features)`` tuple per successfully loaded
        row, where ``features`` is a dict keyed by the column names above.
    """
    # Pollutant columns first, then the engineered filename features.
    feature_columns = (
        'ozone', 'NO2', 'AOT', 'elevation',
        'Station_Name', 'Source_Sensor', 'Latitude', 'Longitude',
        'Start_Time', 'End_Time',
    )
    processed_data = []

    for index, row in df.iterrows():
        full_path = os.path.join(image_folder_path, row['img_name'])
        image = cv2.imread(full_path)

        if image is None:
            print(f"🛑 SKIPPING row {index}: Critical load failure (None). Check file integrity at {full_path}")
            continue
        if image.size == 0:
            print(f"🛑 SKIPPING row {index}: Image loaded, but is empty (Size 0). Check file metadata at {full_path}")
            continue

        try:
            # Collect ALL feature metadata (pollutants + engineered features).
            features = {column: row[column] for column in feature_columns}
        except KeyError as e:
            # A missing column is reported per row and the row is skipped.
            print(f"An error occurred processing row {index}: {e}")
            continue

        # Store the combination.
        # NOTE(review): the tuple layout follows the docstring's
        # "(image_data, feature_data)" wording - confirm against consumers.
        processed_data.append((image, features))

    print("\n✅ Data loading complete.")
    return processed_data
-# ---> Execute the full data loading
# ============================================================================
# 🏁 EXECUTION FLOW
# ============================================================================

# Steps 1 and 2: drop rows whose image is missing, then add the
# filename-derived feature columns to what is left.
df = feature_engineer(check_image_paths(df, image_folder_path))

# Step 3: read every image and pair it with its row's metadata.
combined_dataset = load_image_data(df, image_folder_path)

# Final summary, printed on two lines.
print(
    "\n--- Pipeline Finished ---",
    f"Successfully processed {len(combined_dataset)} data points.",
    sep="\n",
)