torch-randomdata-example-Aerossol.py

   1 import pandas as pd
   2 import os
   3 import cv2  # OpenCV for image handling
   4
   5 # ----------------------------------------------
   6 #  Task 1: Get data
   7 #       - load data from files
   8 #       - validate that every image listed in the CSV exists in the folder
   9 #       - build a list of records pairing each loaded image with its attributes
  10 # ----------------------------------------------
  11
  12 # Define your file paths
  13 csv_path = 'data/data/train.csv'
  14 image_folder_path = './data/data/' # <-- UPDATE THIS PATH!
  15
  16 # Load the metadata
  17 try:
  18     df = pd.read_csv(csv_path)
  19     print("Metadata loaded successfully.")
  20 except FileNotFoundError:
  21     print("Error: train.csv not found.")
  22     exit()
  23
  24
  25 def check_image_paths(df, image_folder_path):
  26     """Checks if all required images exist in the target folder."""
  27     print("\n--- Running Image Path Check ---")
  28
  29     # Get a set of all names actually present in the folder
  30     available_files = set(os.listdir(image_folder_path))
  31
  32     # The CSV names (must be standardized, e.g., convert to lowercase)
  33     required_names = set(df['img_name'])
  34
  35     # Check for missing files
  36     missing_files = required_names - available_files
  37
  38     if missing_files:
  39         print(f"🚨 WARNING: {len(missing_files)} images are missing! Examples: {list(missing_files)[:5]}")
  40         # You might want to filter the DataFrame to only use the rows that have images
  41         df = df[~df['img_name'].isin(missing_files)]
  42         print(f"Cleaned DataFrame size: {len(df)}")
  43     else:
  44         print("✅ All required images were found in the directory.")
  45
  46     return df
  47
  48 # ---> Execute the check
  49 df = check_image_paths(df, image_folder_path)
  50
  51
  52 def load_image_data(df, image_folder_path):
  53     """Loads images and returns a list of (image_data, feature_data) tuples."""
  54
  55     processed_data = []
  56
  57     print("\n--- Loading Images and Features (This may take time) ---")
  58
  59     for index, row in df.iterrows():
  60         img_name = row['img_name']
  61
  62         # Construct the full, absolute path
  63         full_path = os.path.join(image_folder_path, img_name)
  64
  65         try:
  66             # Load the image using OpenCV
  67             image = cv2.imread(full_path)
  68
  69             if image is None:
  70                 print(f"Skipping row {index}: Could not load image at {full_path}")
  71                 continue
  72
  73             # Extract the pollutant/feature metadata
  74             features = {
  75                 'ozone': row['ozone'],
  76                 'NO2': row['NO2'],
  77                 'AOT': row['AOT'],
  78                 'elevation': row['elevation']
  79             }
  80
  81             # Store the combination
  82             processed_data.append({
  83                 'image': image,  # The actual image NumPy array
  84                 'metadata': features
  85             })
  86
  87         except Exception as e:
  88             print(f"An error occurred processing row {index}: {e}")
  89
  90     print("✅ Data loading complete.")
  91     return processed_data
  92
  93 # ---> Execute the full data loading
  94 combined_dataset = load_image_data(df, image_folder_path)
  95