# Source: ue-rnap-aerossol.git / torch-randomdata-example-Aerossol.py
# (gitweb blob view, vgcfreebox.myrthtech.pt)
import os
import re

import cv2  # OpenCV for image handling
import pandas as pd
# ============================================================================
# 🚀 FEATURE ENGINEERING FUNCTION (UPDATED FOR ROBUSTNESS)
# ============================================================================
def extract_features_from_filename(filename):
    """Extract structured metadata fields from an image filename.

    The filename is split on the ``'__'`` delimiter. The first segment is
    searched for a coordinate pair shaped ``DD-DD_DD-DDD``; the text before
    that pair becomes the station name. The second ``'__'`` segment, when
    present, is taken as the source/sensor. A timestamp pair shaped
    ``YYYYMMDDTHHMMSS_YYYYMMDDTHHMMSS`` is searched for anywhere in the
    filename.

    Args:
        filename: The image filename. Non-string values (e.g. NaN from a
            missing CSV cell) are tolerated and yield all-``None`` fields.

    Returns:
        A ``pd.Series`` indexed by ``Station_Name``, ``Source_Sensor``,
        ``Latitude``, ``Longitude``, ``Start_Time`` and ``End_Time``. Every
        field that could not be found is ``None``; found fields are the raw
        matched strings (no numeric or datetime conversion is done).
    """
    features = {
        'Station_Name': None,
        'Source_Sensor': None,
        'Latitude': None,
        'Longitude': None,
        'Start_Time': None,
        'End_Time': None,
    }

    # A missing/NaN filename has nothing to parse; return the empty record
    # instead of failing on ``.split``.
    if not isinstance(filename, str):
        return pd.Series(features)

    # Use the strongest delimiter '__' for splitting
    parts = filename.split('__')

    # --- 1. Station Name, Coordinates, and Sensor ---
    main_segment = parts[0]

    # A. Coordinate Extraction (most stable regex part)
    coord_match = re.search(r'(\d{2}-\d{2})_(\d{2}-\d{3})', main_segment)
    if coord_match:
        # NOTE(review): group 1 is stored as Longitude and group 2 as
        # Latitude, as in the original listing - confirm against the
        # filename convention.
        features['Longitude'] = coord_match.group(1)
        features['Latitude'] = coord_match.group(2)

        # B. Station Name Extraction: everything before the coordinates,
        # with underscores turned into spaces.
        station_part = main_segment[:coord_match.start(1)]
        features['Station_Name'] = station_part.replace('_', ' ').strip()

    # C. Source/Sensor Extraction: only present when a '__' delimiter exists
    if len(parts) > 1:
        features['Source_Sensor'] = parts[1]

    # --- 2. Time/Timestamp Extraction ---
    time_pattern = r'(\d{8}T\d{6})_(\d{8}T\d{6})'
    time_match = re.search(time_pattern, filename)
    if time_match:
        features['Start_Time'] = time_match.group(1)
        features['End_Time'] = time_match.group(2)

    return pd.Series(features)
# ============================================================================
# 💡 MAIN PIPELINE (Rest of the code remains largely the same)
# ============================================================================
# Define your file paths
csv_path = 'data/data/train.csv'
image_folder_path = './data/data/'  # <-- UPDATE THIS PATH!

# Load the metadata table that lists one image filename per row.
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print("Error: train.csv not found. Check the path.")
    # Every later step needs `df`; stop here rather than fail further down
    # with a NameError.
    # NOTE(review): the original statement after this print is not visible
    # in the listing - confirm that exiting is the intended behavior.
    raise SystemExit(1)
else:
    print("Metadata loaded successfully.")
def check_image_paths(df, image_folder_path):
    """Drop rows whose image file is not present in the image folder.

    Args:
        df: DataFrame with an ``img_name`` column holding image filenames.
        image_folder_path: Directory expected to contain those files.

    Returns:
        ``df`` itself when every listed image exists; otherwise a filtered
        copy containing only the rows whose ``img_name`` was found in
        ``image_folder_path``.

    Raises:
        FileNotFoundError: If ``image_folder_path`` does not exist
            (propagated from ``os.listdir``).
    """
    print("\n--- Running Image Path Check ---")

    available_files = set(os.listdir(image_folder_path))
    required_names = set(df['img_name'])
    missing_files = required_names - available_files

    if not missing_files:
        print("✅ All required images were found in the directory.")
        return df

    # Sort so the printed examples are the same on every run (set order is
    # arbitrary); key=str keeps this safe if a NaN slipped into the column.
    examples = sorted(missing_files, key=str)[:5]
    print(f"🚨 WARNING: {len(missing_files)} images are missing! Examples: {examples}")

    # .copy() makes the result an independent frame, so adding columns to it
    # later does not trigger pandas' SettingWithCopyWarning.
    df = df[~df['img_name'].isin(missing_files)].copy()
    print(f"Cleaned DataFrame size after filtering missing images: {len(df)}")
    return df
# ---> Execute the check
df = check_image_paths(df, image_folder_path)
def feature_engineer(df):
    """Add filename-derived feature columns to ``df``.

    Runs ``extract_features_from_filename`` on every value of the
    ``img_name`` column and stores the results in the columns
    ``Station_Name``, ``Source_Sensor``, ``Latitude``, ``Longitude``,
    ``Start_Time`` and ``End_Time``.

    Args:
        df: DataFrame with an ``img_name`` column. It is modified in place.

    Returns:
        The same ``df`` object, with the six feature columns added (or
        overwritten if they already existed).
    """
    print("\n--- Performing Feature Engineering on Filenames ---")

    feature_columns = [
        'Station_Name', 'Source_Sensor', 'Latitude',
        'Longitude', 'Start_Time', 'End_Time',
    ]

    # Build the extracted table explicitly so it has the expected columns
    # even when `df` has no rows.
    records = [
        extract_features_from_filename(name).to_dict()
        for name in df['img_name']
    ]
    extracted = pd.DataFrame(records, index=df.index, columns=feature_columns)

    # Assign column by column, by label, so the result does not depend on
    # the key order of the Series returned by the extractor.
    for column in feature_columns:
        df[column] = extracted[column]

    print("✅ Feature Engineering Complete. New columns added.")
    return df
def load_image_data(df, image_folder_path):
    """Load every image listed in ``df`` and pair it with its row metadata.

    Rows whose image cannot be read (``cv2.imread`` returns ``None``) or
    whose image array is empty are skipped with a printed message. Any other
    exception raised while processing a row is printed and that row is
    skipped, so one bad row does not abort the whole load.

    Args:
        df: DataFrame with an ``img_name`` column plus the metadata columns
            to carry along.
        image_folder_path: Directory containing the image files.

    Returns:
        A list of dicts, one per successfully loaded row, each with the keys
        ``'image'`` (the array returned by ``cv2.imread``) and ``'features'``
        (a dict mapping column name to that row's value).
    """
    processed_data = []

    # NOTE(review): the listing shows 'ozone', 'elevation' and the six
    # engineered columns explicitly, but two further keys between 'ozone' and
    # 'elevation' are not visible. Every non-filename column is carried over
    # instead, so nothing is dropped - confirm against the original file.
    feature_columns = [col for col in df.columns if col != 'img_name']

    print("\n--- Loading Images and Features (This may take time) ---")

    for index, row in df.iterrows():
        try:
            img_name = row['img_name']
            full_path = os.path.join(image_folder_path, img_name)
            image = cv2.imread(full_path)

            if image is None:
                print(f"🛑 SKIPPING row {index}: Critical load failure (None). Check file integrity at {full_path}")
                continue

            if image.size == 0:
                print(f"🛑 SKIPPING row {index}: Image loaded, but is empty (Size 0). Check file metadata at {full_path}")
                continue

            # Collect ALL feature metadata (Pollutants + Engineered Features)
            features = {col: row[col] for col in feature_columns}

            # Store the combination.
            # NOTE(review): the name of the second key is not visible in the
            # listing; 'features' is assumed - confirm against consumers.
            processed_data.append({
                'image': image,  # The actual image NumPy array
                'features': features,
            })
        except Exception as e:
            # Deliberate best-effort: report the row and keep going.
            print(f"An error occurred processing row {index}: {e}")

    print("\n✅ Data loading complete.")
    return processed_data
# ============================================================================
# PIPELINE EXECUTION
# ============================================================================

# 1. Drop rows whose image file is missing from the image folder
df = check_image_paths(df, image_folder_path)

# 2. Feature Engineer the metadata
df = feature_engineer(df)

# 3. Load and combine the data
combined_dataset = load_image_data(df, image_folder_path)

print("\n--- Pipeline Finished ---")
print(f"Successfully processed {len(combined_dataset)} data points.")