"""Pair images with their CSV metadata and filename-derived features.

Pipeline: load ``train.csv``, drop rows whose image file is missing, parse
structured fields out of each image filename, then load every image with
OpenCV and pair it with its metadata.

Provenance: reconstructed from the post-patch side of the gitweb diff
511f02de1cd8548ac9fb555654675cc8edf46986..2fff9c8a7b11d06c2fbd51b7780cbbc287b2b730
of torch-randomdata-example-Aerossol.py. A few lines fall in gaps between
diff hunks and were not visible; each such spot is marked ``NOTE(review)``.
"""
import os
import re
import sys

import numpy as np
import pandas as pd

# Columns produced by filename parsing, in the order they are added.
FEATURE_COLUMNS = (
    'Station_Name',
    'Source_Sensor',
    'Latitude',
    'Longitude',
    'Start_Time',
    'End_Time',
)

# Metadata columns read straight from the CSV for every image.
_LABEL_COLUMNS = ('ozone', 'NO2', 'AOT', 'elevation')

# Compiled once; both patterns are applied to every filename.
_COORD_RE = re.compile(r'(\d{2}-\d{2})_(\d{2}-\d{3})')
_TIME_RE = re.compile(r'(\d{8}T\d{6})_(\d{8}T\d{6})')

# Define your file paths
csv_path = 'data/data/train.csv'
# NOTE(review): the original assignment of image_folder_path lies in a hunk
# gap of the diff; this value is a placeholder and must be confirmed.
image_folder_path = 'data/data/train'


# ============================================================================
# FEATURE ENGINEERING
# ============================================================================
def _parse_filename(filename):
    """Return a dict of the FEATURE_COLUMNS parsed from one filename.

    Fields that cannot be found stay ``None``. Non-string input (for
    example a NaN from an empty CSV cell) yields an all-``None`` dict
    instead of raising.
    """
    features = dict.fromkeys(FEATURE_COLUMNS)
    if not isinstance(filename, str):
        return features

    # '__' is the strongest delimiter; str.split always returns at least
    # one element, so the first segment is always present.
    main_segment, *other_segments = filename.split('__')

    coord_match = _COORD_RE.search(main_segment)
    if coord_match:
        # NOTE(review): group 1 -> Longitude, group 2 -> Latitude is kept
        # exactly as in the original; confirm against the naming scheme.
        features['Longitude'] = coord_match.group(1)
        features['Latitude'] = coord_match.group(2)
        # Everything before the coordinates is taken as the station name.
        station_part = main_segment[:coord_match.start(1)]
        features['Station_Name'] = station_part.replace('_', ' ').strip()

    # NOTE(review): indentation was lost in the diff; the sensor field is
    # treated as independent of the coordinate match.
    if other_segments:
        features['Source_Sensor'] = other_segments[0]

    # Pattern: (YYYYMMDDTHHMMSS)_(YYYYMMDDTHHMMSS), searched in the full name.
    time_match = _TIME_RE.search(filename)
    if time_match:
        features['Start_Time'] = time_match.group(1)
        features['End_Time'] = time_match.group(2)

    return features


def extract_features_from_filename(filename):
    """Extract structured features from an image filename.

    Args:
        filename: Image file name. Non-string values are tolerated.

    Returns:
        A ``pd.Series`` indexed by FEATURE_COLUMNS; fields that could not
        be parsed are ``None``.
    """
    return pd.Series(_parse_filename(filename))


def feature_engineer(df):
    """Add the filename-derived FEATURE_COLUMNS to ``df``.

    The columns are written into ``df`` in place and the same object is
    returned.

    Args:
        df: DataFrame with an ``img_name`` column.

    Returns:
        ``df`` with the six feature columns added.
    """
    print("\n--- Performing Feature Engineering on Filenames ---")

    # Parse each name once, then assign column by column from plain lists.
    # This avoids index alignment and also works for an empty frame.
    parsed_rows = [_parse_filename(name) for name in df['img_name']]
    for column in FEATURE_COLUMNS:
        df[column] = [parsed[column] for parsed in parsed_rows]

    print("✅ Feature Engineering Complete. New columns added.")
    return df


# ============================================================================
# DATA LOADING
# ============================================================================
def check_image_paths(df, image_folder_path):
    """Drop rows of ``df`` whose image is absent from the folder.

    Args:
        df: DataFrame with an ``img_name`` column.
        image_folder_path: Directory expected to contain the images.

    Returns:
        ``df`` itself when nothing is missing, otherwise a filtered copy.

    Raises:
        OSError: If ``image_folder_path`` cannot be listed.
    """
    print("\n--- Running Image Path Check ---")

    available_files = set(os.listdir(image_folder_path))
    required_names = set(df['img_name'])
    missing_files = required_names - available_files

    if missing_files:
        # Sorted so the reported examples are stable between runs.
        examples = sorted(missing_files, key=str)[:5]
        print(f"🚨 WARNING: {len(missing_files)} images are missing! Examples: {examples}")
        # .copy() so later column assignment does not write into a slice.
        df = df[~df['img_name'].isin(missing_files)].copy()
        print(f"Cleaned DataFrame size after filtering missing images: {len(df)}")
    else:
        print("✅ All required images were found in the directory.")

    # NOTE(review): the function tail lies in a hunk gap; returning df is
    # inferred from the call site `df = check_image_paths(...)`.
    return df


def load_image_data(df, image_folder_path):
    """Load images and return a list of (image_data, feature_data) tuples.

    Rows whose image cannot be read, or loads as an empty array, are
    skipped with a message.

    Args:
        df: DataFrame with ``img_name``, the label columns, and the
            engineered FEATURE_COLUMNS.
        image_folder_path: Directory containing the image files.

    Returns:
        List of ``(image, features)`` tuples, where ``features`` is a dict
        of the label and engineered columns for that row.
    """
    # Imported here so the filename helpers stay importable without OpenCV.
    import cv2  # OpenCV for image handling

    # NOTE(review): the lines before the loop lie in a hunk gap; only the
    # accumulator implied by the final `return processed_data` is restored.
    processed_data = []
    wanted_columns = _LABEL_COLUMNS + FEATURE_COLUMNS

    for index, row in df.iterrows():
        img_name = row['img_name']
        full_path = os.path.join(image_folder_path, img_name)
        image = cv2.imread(full_path)

        if image is None:
            print(f"🛑 SKIPPING row {index}: Critical load failure (None). Check file integrity at {full_path}")
            continue
        if image.size == 0:
            print(f"🛑 SKIPPING row {index}: Image loaded, but is empty (Size 0). Check file metadata at {full_path}")
            continue

        try:
            # Collect ALL feature metadata (pollutants + engineered features).
            features = {column: row[column] for column in wanted_columns}
        except KeyError as e:
            print(f"An error occurred processing row {index}: {e}")
            continue

        # NOTE(review): the lines after "Store the combination" lie in a hunk
        # gap; the tuple shape follows the original docstring.
        processed_data.append((image, features))

    print("\n✅ Data loading complete.")
    return processed_data


# ============================================================================
# EXECUTION FLOW
# ============================================================================
def main():
    """Run the full pipeline and return the combined dataset."""
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print("Error: train.csv not found. Check the path.")
        # Non-zero status so callers can detect the failure.
        sys.exit(1)
    print("Metadata loaded successfully.")

    # 1. Check paths (once; the original ran this step twice)
    df = check_image_paths(df, image_folder_path)

    # 2. Feature Engineer the metadata
    df = feature_engineer(df)

    # 3. Load and combine the data
    combined_dataset = load_image_data(df, image_folder_path)
    print("\n--- Pipeline Finished ---")
    print(f"Successfully processed {len(combined_dataset)} data points.")
    return combined_dataset


if __name__ == "__main__":
    main()