torch-randomdata-example-Aerossol.py

   1 import pandas as pd
   2 import os
   3 import cv2  # OpenCV for image handling
   4 import re
   5 import numpy as np
   6
   7 # ============================================================================
   8 # 🚀 FEATURE ENGINEERING FUNCTION (UPDATED FOR ROBUSTNESS)
   9 # ============================================================================
  10 def extract_features_from_filename(filename):
  11     """
  12     Extracts structured features from the filename using regex and string splitting.
  13     Handles potential None matches gracefully.
  14     """
  15
  16     features = {
  17         'Station_Name': None,
  18         'Source_Sensor': None,
  19         'Latitude': None,
  20         'Longitude': None,
  21         'Start_Time': None,
  22         'End_Time': None
  23     }
  24
  25     # Use the strongest delimiter '__' for splitting
  26     parts = filename.split('__')
  27
  28     # --- 1. Station Name, Coordinates, and Sensor ---
  29     if parts:
  30         main_segment = parts[0]
  31
  32         # A. Coordinate Extraction (Most stable regex part)
  33         # Pattern: (XX-XX)_[YY-ZZ]
  34         coord_match = re.search(r'(\d{2}-\d{2})_(\d{2}-\d{3})', main_segment)
  35
  36         if coord_match:
  37             # We can safely use group(1) and group(2) because we checked for coord_match
  38             features['Longitude'] = coord_match.group(1)
  39             features['Latitude'] = coord_match.group(2)
  40
  41             # B. Station Name Extraction: Takes everything before the coordinates
  42             # This is still fragile, but using the match start point is the best guess.
  43             # We assume the coordinates start the numerical part.
  44             station_part = main_segment[:coord_match.start(1)]
  45             # Clean up the station name by removing underscores
  46             features['Station_Name'] = station_part.replace('_', ' ').strip()
  47
  48         # C. Source/Sensor Extraction
  49         if len(parts) > 1:
  50             features['Source_Sensor'] = parts[1]
  51
  52     # --- 2. Time/Timestamp Extraction ---
  53     # Pattern: (YYYYMMDDTHHMMSS)_(YYYYMMDDTHHMMSS)
  54     time_pattern = r'(\d{8}T\d{6})_(\d{8}T\d{6})'
  55     time_match = re.search(time_pattern, filename)
  56
  57     if time_match:
  58         # We are safe here because the 'if time_match:' block guarantees its existence
  59         features['Start_Time'] = time_match.group(1)
  60         features['End_Time'] = time_match.group(2)
  61
  62     return pd.Series(features)
  63
  64
  65 # ============================================================================
  66 # 💡 MAIN PIPELINE
  67 # ========================================================================
  68
  69 # Define your file paths
  70 csv_path = 'data/data/train.csv'
  71 image_folder_path = './data/data/' # <-- UPDATE THIS PATH!
  72
  73 # Load the metadata
  74 try:
  75     df = pd.read_csv(csv_path)
  76     print("Metadata loaded successfully.")
  77 except FileNotFoundError:
  78     print("Error: train.csv not found. Check the path.")
  79     exit()
  80
  81
  82 def check_image_paths(df, image_folder_path):
  83     """Checks if all required images exist in the target folder."""
  84     print("\n--- Running Image Path Check ---")
  85
  86     available_files = set(os.listdir(image_folder_path))
  87     required_names = set(df['img_name'])
  88
  89     missing_files = required_names - available_files
  90
  91     if missing_files:
  92         print(f"🚨 WARNING: {len(missing_files)} images are missing! Examples: {list(missing_files)[:5]}")
  93         df = df[~df['img_name'].isin(missing_files)]
  94         print(f"Cleaned DataFrame size after filtering missing images: {len(df)}")
  95     else:
  96         print("✅ All required images were found in the directory.")
  97
  98     return df
  99
 100 # ---> Execute the check
 101 df = check_image_paths(df, image_folder_path)
 102
 103
 104 def feature_engineer(df):
 105     """Applies feature extraction logic to the 'img_name' column."""
 106     print("\n--- Performing Feature Engineering on Filenames ---")
 107
 108     # Apply the function to the 'img_name' column and create new features
 109     df[['Station_Name', 'Source_Sensor', 'Latitude', 'Longitude', 'Start_Time', 'End_Time']] = \
 110         df['img_name'].apply(lambda x: extract_features_from_filename(x))
 111
 112     print("✅ Feature Engineering Complete. New columns added.")
 113     return df
 114
 115
 116 def load_image_data(df, image_folder_path):
 117     """Loads images and returns a list of (image_data, feature_data) tuples."""
 118
 119     processed_data = []
 120
 121     print("\n--- Loading Images and Features (This may take time) ---")
 122
 123     for index, row in df.iterrows():
 124         img_name = row['img_name']
 125         full_path = os.path.join(image_folder_path, img_name)
 126         image = cv2.imread(full_path)
 127
 128         if image is None:
 129             print(f"🛑 SKIPPING row {index}: Critical load failure (None). Check file integrity at {full_path}")
 130             continue
 131         if image.size == 0:
 132              print(f"🛑 SKIPPING row {index}: Image loaded, but is empty (Size 0). Check file metadata at {full_path}")
 133              continue
 134
 135         try:
 136             # Collect ALL feature metadata (Pollutants + Engineered Features)
 137             features = {
 138                 'ozone': row['ozone'],
 139                 'NO2': row['NO2'],
 140                 'AOT': row['AOT'],
 141                 'elevation': row['elevation'],
 142                 # Engineered features added here:
 143                 'Station_Name': row['Station_Name'],
 144                 'Source_Sensor': row['Source_Sensor'],
 145                 'Latitude': row['Latitude'],
 146                 'Longitude': row['Longitude'],
 147                 'Start_Time': row['Start_Time'],
 148                 'End_Time': row['End_Time']
 149             }
 150
 151             # Store the combination
 152             processed_data.append({
 153                 'image': image,  # The actual image NumPy array
 154                 'metadata': features
 155             })
 156
 157         except Exception as e:
 158             print(f"An error occurred processing row {index}: {e}")
 159
 160     print("\n✅ Data loading complete.")
 161     return processed_data
 162
 163 # ============================================================================
 164 # 🏁 EXECUTION FLOW
 165 # ============================================================================
 166
 167 # 1. Check paths
 168 df = check_image_paths(df, image_folder_path)
 169
 170 # 2. Feature Engineer the metadata
 171 df = feature_engineer(df)
 172
 173 # 3. Load and combine the data
 174 combined_dataset = load_image_data(df, image_folder_path)
 175
 176 print("\n--- Pipeline Finished ---")
 177 print(f"Successfully processed {len(combined_dataset)} data points.")