X-Git-Url: https://vgcfreebox.myrthtech.pt/gitweb/ue-rnap-aerossol.git/blobdiff_plain/511f02de1cd8548ac9fb555654675cc8edf46986..2fff9c8a7b11d06c2fbd51b7780cbbc287b2b730:/torch-randomdata-example-Aerossol.py?ds=inline

diff --git a/torch-randomdata-example-Aerossol.py b/torch-randomdata-example-Aerossol.py
index 2f5189b..415625e 100644
--- a/torch-randomdata-example-Aerossol.py
+++ b/torch-randomdata-example-Aerossol.py
@@ -1,13 +1,70 @@
 import pandas as pd
 import os
 import cv2  # OpenCV for image handling
+import re
+import numpy as np 
 
-# ----------------------------------------------
-#  Task 1: Get data
-#       - load data from files
-#       - validate the existence of all images loaded in folder
-#       - creates numpy array to store data attributes
-# ----------------------------------------------
+# ============================================================================
+# FEATURE ENGINEERING FUNCTION (UPDATED FOR ROBUSTNESS)
+# ============================================================================
+def extract_features_from_filename(filename):
+    """
+    Extracts structured features from the filename using regex and string splitting.
+    Returns a pd.Series of the six features; any field whose pattern does not match stays None.
+    """
+    
+    features = {
+        'Station_Name': None,
+        'Source_Sensor': None,
+        'Latitude': None,
+        'Longitude': None,
+        'Start_Time': None,
+        'End_Time': None
+    }
+    
+    # Use the strongest delimiter '__' for splitting
+    parts = filename.split('__') 
+    
+    # --- 1. Station Name, Coordinates, and Sensor ---
+    if parts:
+        main_segment = parts[0] 
+        
+        # A. Coordinate Extraction (Most stable regex part)
+        # Pattern: (DD-DD)_(DD-DDD), i.e. 2 digits, '-', 2 digits, '_', 2 digits, '-', 3 digits
+        coord_match = re.search(r'(\d{2}-\d{2})_(\d{2}-\d{3})', main_segment)
+        
+        if coord_match:
+            # We can safely use group(1) and group(2) because we checked for coord_match
+            features['Longitude'] = coord_match.group(1)
+            features['Latitude'] = coord_match.group(2)
+
+            # B. Station Name Extraction: Takes everything before the coordinates
+            # This is still fragile, but using the match start point is the best guess.
+            # We assume the coordinates start the numerical part.
+            station_part = main_segment[:coord_match.start(1)] 
+            # Clean up the station name by removing underscores
+            features['Station_Name'] = station_part.replace('_', ' ').strip()
+        
+        # C. Source/Sensor Extraction
+        if len(parts) > 1:
+            features['Source_Sensor'] = parts[1]
+    
+    # --- 2. Time/Timestamp Extraction ---
+    # Pattern: (YYYYMMDDTHHMMSS)_(YYYYMMDDTHHMMSS)
+    time_pattern = r'(\d{8}T\d{6})_(\d{8}T\d{6})'
+    time_match = re.search(time_pattern, filename)
+    
+    if time_match:
+        # We are safe here because the 'if time_match:' block guarantees its existence
+        features['Start_Time'] = time_match.group(1)
+        features['End_Time'] = time_match.group(2)
+    
+    return pd.Series(features)
+
+
+# ============================================================================
+# MAIN PIPELINE (Rest of the code remains largely the same)
+# ============================================================================
 
 # Define your file paths
 csv_path = 'data/data/train.csv'
@@ -18,7 +75,7 @@ try:
     df = pd.read_csv(csv_path)
     print("Metadata loaded successfully.")
 except FileNotFoundError:
-    print("Error: train.csv not found.")
+    print("Error: train.csv not found. Check the path.")
     exit()
 
 
@@ -26,20 +83,15 @@ def check_image_paths(df, image_folder_path):
     """Checks if all required images exist in the target folder."""
     print("\n--- Running Image Path Check ---")
     
-    # Get a set of all names actually present in the folder
     available_files = set(os.listdir(image_folder_path))
-    
-    # The CSV names (must be standardized, e.g., convert to lowercase)
     required_names = set(df['img_name'])
     
-    # Check for missing files
     missing_files = required_names - available_files
     
     if missing_files:
         print(f"ð¨ WARNING: {len(missing_files)} images are missing! Examples: {list(missing_files)[:5]}")
-        # You might want to filter the DataFrame to only use the rows that have images
         df = df[~df['img_name'].isin(missing_files)]
-        print(f"Cleaned DataFrame size: {len(df)}")
+        print(f"Cleaned DataFrame size after filtering missing images: {len(df)}")
     else:
         print("â All required images were found in the directory.")
     
@@ -49,6 +101,18 @@ def check_image_paths(df, image_folder_path):
 df = check_image_paths(df, image_folder_path)
 
 
+def feature_engineer(df):
+    """Applies feature extraction logic to the 'img_name' column."""
+    print("\n--- Performing Feature Engineering on Filenames ---")
+    
+    # Apply the function to the 'img_name' column and create new features
+    df[['Station_Name', 'Source_Sensor', 'Latitude', 'Longitude', 'Start_Time', 'End_Time']] = \
+        df['img_name'].apply(lambda x: extract_features_from_filename(x))
+        
+    print("â Feature Engineering Complete. New columns added.")
+    return df
+
+
 def load_image_data(df, image_folder_path):
     """Loads images and returns a list of (image_data, feature_data) tuples."""
     
@@ -58,24 +122,30 @@ def load_image_data(df, image_folder_path):
     
     for index, row in df.iterrows():
         img_name = row['img_name']
-        
-        # Construct the full, absolute path
         full_path = os.path.join(image_folder_path, img_name)
+        image = cv2.imread(full_path)
         
-        try:
-            # Load the image using OpenCV
-            image = cv2.imread(full_path)
-            
-            if image is None:
-                print(f"Skipping row {index}: Could not load image at {full_path}")
-                continue
-                
-            # Extract the pollutant/feature metadata
+        if image is None:
+            print(f"ð SKIPPING row {index}: Critical load failure (None). Check file integrity at {full_path}")
+            continue
+        if image.size == 0:
+             print(f"ð SKIPPING row {index}: Image loaded, but is empty (Size 0). Check file metadata at {full_path}")
+             continue
+        
+        try:                
+            # Collect ALL feature metadata (Pollutants + Engineered Features)
             features = {
                 'ozone': row['ozone'],
                 'NO2': row['NO2'],
                 'AOT': row['AOT'],
-                'elevation': row['elevation']
+                'elevation': row['elevation'],
+                # Engineered features added here:
+                'Station_Name': row['Station_Name'],
+                'Source_Sensor': row['Source_Sensor'],
+                'Latitude': row['Latitude'],
+                'Longitude': row['Longitude'],
+                'Start_Time': row['Start_Time'],
+                'End_Time': row['End_Time']
             }
             
             # Store the combination
@@ -87,9 +157,21 @@ def load_image_data(df, image_folder_path):
         except Exception as e:
             print(f"An error occurred processing row {index}: {e}")
             
-    print("â Data loading complete.")
+    print("\nâ Data loading complete.")
     return processed_data
 
-# ---> Execute the full data loading
+# ============================================================================
+# EXECUTION FLOW
+# ============================================================================
+
+# 1. Check paths (NOTE: repeats the check_image_paths call already made earlier in this file)
+df = check_image_paths(df, image_folder_path)
+
+# 2. Feature Engineer the metadata
+df = feature_engineer(df)
+
+# 3. Load and combine the data
 combined_dataset = load_image_data(df, image_folder_path)
 
+print("\n--- Pipeline Finished ---")
+print(f"Successfully processed {len(combined_dataset)} data points.")