]> vgcfreebox.myrthtech.pt Git - ue-rnap-aerossol.git/blob - torch-randomdata-example-Aerossol.py
code executed in ue niia server
[ue-rnap-aerossol.git] / torch-randomdata-example-Aerossol.py
1 import pandas as pd
2 import os
3 import cv2 # OpenCV for image handling
4 import re
5 import numpy as np
6
# ============================================================================
# 🚀 FEATURE ENGINEERING FUNCTION (UPDATED FOR ROBUSTNESS)
# ============================================================================
def extract_features_from_filename(filename):
    """
    Parse station, sensor, coordinate and time fields out of an image filename.

    The filename is split on the double underscore '__'. The first segment is
    searched for a 'DD-DD_DD-DDD' digit group; the second segment (when there
    is one) is taken verbatim as the sensor. The whole filename is searched
    for a pair of 'YYYYMMDDTHHMMSS' stamps joined by a single underscore.

    Args:
        filename: The image filename as a string.

    Returns:
        A pandas Series with the keys 'Station_Name', 'Source_Sensor',
        'Latitude', 'Longitude', 'Start_Time' and 'End_Time', in that order.
        Any field that could not be found is None. All found values are
        returned as the raw matched strings (no numeric or date conversion).
    """
    segments = filename.split('__')
    head = segments[0]

    station = longitude = latitude = None
    coords = re.search(r'(\d{2}-\d{2})_(\d{2}-\d{3})', head)
    if coords is not None:
        # First digit group is stored as Longitude, second as Latitude.
        longitude, latitude = coords.groups()
        # Everything ahead of the coordinates is treated as the station name;
        # underscores become spaces and surrounding whitespace is trimmed.
        station = head[:coords.start(1)].replace('_', ' ').strip()

    # The sensor is simply the second '__'-separated segment, if present.
    sensor = segments[1] if len(segments) > 1 else None

    # Start/end stamps may sit anywhere in the name, so search the full string.
    window = re.search(r'(\d{8}T\d{6})_(\d{8}T\d{6})', filename)
    start_time, end_time = window.groups() if window is not None else (None, None)

    return pd.Series({
        'Station_Name': station,
        'Source_Sensor': sensor,
        'Latitude': latitude,
        'Longitude': longitude,
        'Start_Time': start_time,
        'End_Time': end_time,
    })
63
64
# ============================================================================
# 💡 MAIN PIPELINE: configuration, metadata loading and pipeline helpers
# ============================================================================
68
# Define your file paths
csv_path = 'data/data/train.csv'
image_folder_path = './data/data/'  # <-- UPDATE THIS PATH!

# Load the metadata
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print("Error: train.csv not found. Check the path.")
    # Stop with a non-zero status so callers (shell scripts, schedulers) can
    # see the failure. The bare exit() builtin is injected by the `site`
    # module for interactive use and, called without an argument, reports
    # success (status 0).
    raise SystemExit(1) from None
else:
    print("Metadata loaded successfully.")
80
81
def check_image_paths(df, image_folder_path):
    """Drop metadata rows whose image file is not present in the image folder.

    The names in the 'img_name' column are compared against the entries
    returned by os.listdir(image_folder_path) (exact name match, top level of
    the folder only). A short report is printed either way.

    Args:
        df: DataFrame with an 'img_name' column.
        image_folder_path: Directory that should contain the image files.

    Returns:
        `df` itself when every image was found; otherwise a new, independent
        DataFrame containing only the rows whose image exists.

    Raises:
        FileNotFoundError: Propagated from os.listdir if the folder is missing.
        KeyError: If `df` has no 'img_name' column.
    """
    print("\n--- Running Image Path Check ---")

    available_files = set(os.listdir(image_folder_path))
    missing_files = set(df['img_name']) - available_files

    if not missing_files:
        print("✅ All required images were found in the directory.")
        return df

    print(f"🚨 WARNING: {len(missing_files)} images are missing! Examples: {list(missing_files)[:5]}")
    # Return an explicit copy: the filtered frame gets new columns assigned
    # later in the pipeline, and assigning into a boolean-mask slice raises
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    df = df[~df['img_name'].isin(missing_files)].copy()
    print(f"Cleaned DataFrame size after filtering missing images: {len(df)}")
    return df
99
# NOTE: the image-path check is executed once, as step 1 of the EXECUTION FLOW
# section at the bottom of this file. Calling it here as well only listed the
# image folder and printed the same report a second time.
102
103
def feature_engineer(df):
    """Add the filename-derived columns to `df` and return it.

    Every value in the 'img_name' column is passed through
    extract_features_from_filename, and the resulting fields are written into
    six columns of `df`. The assignment happens on the DataFrame that was
    passed in; the same object is returned.

    Args:
        df: DataFrame with an 'img_name' column of filename strings.

    Returns:
        `df`, now carrying 'Station_Name', 'Source_Sensor', 'Latitude',
        'Longitude', 'Start_Time' and 'End_Time'.
    """
    print("\n--- Performing Feature Engineering on Filenames ---")

    # Same order as the keys of the Series built by
    # extract_features_from_filename.
    engineered_columns = [
        'Station_Name', 'Source_Sensor', 'Latitude',
        'Longitude', 'Start_Time', 'End_Time',
    ]
    parsed = df['img_name'].apply(extract_features_from_filename)
    df[engineered_columns] = parsed

    print("✅ Feature Engineering Complete. New columns added.")
    return df
114
115
def load_image_data(df, image_folder_path):
    """Read each row's image with OpenCV and pair it with that row's metadata.

    Rows are visited in DataFrame order. A row is skipped (with a printed
    message) when cv2.imread returns None for its file or when the loaded
    array has size 0. If collecting the metadata for a row raises, the error
    is printed and the row is left out; processing continues with the next row.

    Args:
        df: DataFrame with 'img_name', the measurement columns ('ozone',
            'NO2', 'AOT', 'elevation') and the six filename-derived columns.
        image_folder_path: Directory the image filenames are joined onto.

    Returns:
        A list of dicts, one per successfully loaded row, each with the keys
        'image' (the array returned by cv2.imread) and 'metadata' (a dict of
        the ten metadata values for that row).
    """
    # Pollutant/site measurements first, then the engineered filename fields.
    metadata_columns = (
        'ozone', 'NO2', 'AOT', 'elevation',
        'Station_Name', 'Source_Sensor', 'Latitude',
        'Longitude', 'Start_Time', 'End_Time',
    )
    records = []

    print("\n--- Loading Images and Features (This may take time) ---")

    for idx, record in df.iterrows():
        image_path = os.path.join(image_folder_path, record['img_name'])
        pixels = cv2.imread(image_path)

        if pixels is None:
            print(f"🛑 SKIPPING row {idx}: Critical load failure (None). Check file integrity at {image_path}")
            continue
        if pixels.size == 0:
            print(f"🛑 SKIPPING row {idx}: Image loaded, but is empty (Size 0). Check file metadata at {image_path}")
            continue

        try:
            metadata = {column: record[column] for column in metadata_columns}
            records.append({'image': pixels, 'metadata': metadata})
        except Exception as err:
            print(f"An error occurred processing row {idx}: {err}")

    print("\n✅ Data loading complete.")
    return records
162
163 # ============================================================================
164 # 🏁 EXECUTION FLOW
165 # ============================================================================
166
# Step 1: keep only the metadata rows whose image file exists on disk.
df = check_image_paths(df, image_folder_path)

# Step 2: add the columns derived from each image filename.
df = feature_engineer(df)

# Step 3: read the images and attach each row's metadata.
combined_dataset = load_image_data(df, image_folder_path)

# Final summary: banner line followed by the number of loaded samples.
print(
    "\n--- Pipeline Finished ---",
    f"Successfully processed {len(combined_dataset)} data points.",
    sep="\n",
)