# ============================================================================
# 🚀 FEATURE ENGINEERING FUNCTION (UPDATED FOR ROBUSTNESS)
# ============================================================================
def extract_features_from_filename(filename):
    """Parse station, coordinate, sensor and time-range fields from a filename.

    The name is split on the double-underscore delimiter ``'__'``. The first
    segment is searched for a coordinate pair shaped ``DD-DD_DD-DDD``; the
    text preceding that pair becomes the station name. The second segment,
    when present, is stored verbatim as the source/sensor. The whole filename
    is then searched for a ``YYYYMMDDTHHMMSS_YYYYMMDDTHHMMSS`` pair.

    Args:
        filename: File name to parse. Any value that is not a ``str`` (for
            example ``None`` or a NaN coming from a DataFrame column) yields
            a result with every field set to ``None`` instead of raising.

    Returns:
        pd.Series: Indexed by 'Station_Name', 'Source_Sensor', 'Latitude',
        'Longitude', 'Start_Time' and 'End_Time'. Each value is the matched
        substring, or ``None`` when that field could not be extracted.
    """
    features = {
        'Station_Name': None,
        'Source_Sensor': None,
        'Latitude': None,
        'Longitude': None,
        'Start_Time': None,
        'End_Time': None,
    }

    # Missing or non-text names cannot be parsed; report every field as
    # missing rather than failing on `.split`.
    if not isinstance(filename, str):
        return pd.Series(features)

    # '__' is the top-level delimiter. str.split always returns at least one
    # element, so parts[0] is safe to read unconditionally.
    parts = filename.split('__')
    main_segment = parts[0]

    # --- 1. Station Name, Coordinates, and Sensor ---
    # Coordinate pair: two digits, '-', two digits, '_', two digits, '-',
    # three digits.
    coord_match = re.search(r'(\d{2}-\d{2})_(\d{2}-\d{3})', main_segment)
    if coord_match:
        # NOTE(review): the first group is stored as Longitude and the second
        # as Latitude -- confirm this order against the naming convention.
        features['Longitude'] = coord_match.group(1)
        features['Latitude'] = coord_match.group(2)

        # The station name is whatever precedes the coordinates, with
        # underscores turned into spaces and surrounding whitespace removed.
        station_part = main_segment[:coord_match.start(1)]
        features['Station_Name'] = station_part.replace('_', ' ').strip()

    # The second '__'-separated segment is taken verbatim as the sensor.
    if len(parts) > 1:
        features['Source_Sensor'] = parts[1]

    # --- 2. Time/Timestamp Extraction ---
    # Searched over the full filename, so the pair may sit in any segment.
    time_match = re.search(r'(\d{8}T\d{6})_(\d{8}T\d{6})', filename)
    if time_match:
        features['Start_Time'] = time_match.group(1)
        features['End_Time'] = time_match.group(2)

    return pd.Series(features)
+
+
# ============================================================================
# 💡 MAIN PIPELINE (Rest of the code remains largely the same)
# ============================================================================