]> vgcfreebox.myrthtech.pt Git - ue-rnap-aerossol.git/blob - torch-randomdata-example-Aerossol.py
setting up knowledge
[ue-rnap-aerossol.git] / torch-randomdata-example-Aerossol.py
1 import pandas as pd
2 import os
3 import cv2 # OpenCV for image handling
4
5 # ----------------------------------------------
6 # Task 1: Get data
7 # - load data from files
8 # - validate the existence of all images loaded in folder
9 # - creates numpy array to store data attributes
10 # ----------------------------------------------
11
# Define your file paths
csv_path = 'data/data/train.csv'
image_folder_path = './data/data/'  # <-- UPDATE THIS PATH!

# Load the metadata. Only the read itself is guarded so that an unrelated
# failure is never reported as a missing file.
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print("Error: train.csv not found.")
    # Stop with a non-zero status so shells and schedulers see the failure.
    # The bare exit() helper comes from the `site` module (intended for
    # interactive sessions) and would report success (status 0).
    raise SystemExit(1) from None
else:
    print("Metadata loaded successfully.")
23
24
def check_image_paths(df, image_folder_path):
    """Drop metadata rows whose image file is absent from the folder.

    Names in the ``img_name`` column are compared exactly as written
    (case-sensitive, no normalisation) against the regular files found
    directly inside *image_folder_path*.

    Args:
        df: DataFrame with an ``img_name`` column.
        image_folder_path: Directory expected to contain the images.

    Returns:
        *df* itself when every name is present, otherwise a filtered
        DataFrame holding only the rows whose image exists. The original
        index labels are kept.

    Raises:
        FileNotFoundError: If *image_folder_path* does not exist.
        KeyError: If *df* has no ``img_name`` column.
    """
    print("\n--- Running Image Path Check ---")

    # Only regular files count: a sub-directory that happens to share a
    # name with an image cannot be loaded afterwards.
    with os.scandir(image_folder_path) as entries:
        available_files = {entry.name for entry in entries if entry.is_file()}

    has_image = df['img_name'].isin(available_files)

    if has_image.all():
        print("✅ All required images were found in the directory.")
        return df

    # unique() collapses repeated names (and repeated NaN); sorting makes the
    # printed examples reproducible from run to run.
    missing_files = sorted(df.loc[~has_image, 'img_name'].unique(), key=str)
    print(f"🚨 WARNING: {len(missing_files)} images are missing! Examples: {missing_files[:5]}")

    # Keep only the rows that have an image on disk.
    df = df[has_image]
    print(f"Cleaned DataFrame size: {len(df)}")
    return df
47
# ---> Execute the check: from here on `df` only holds rows whose image
# file was found in `image_folder_path`.
df = check_image_paths(df, image_folder_path)
50
51
def load_image_data(df, image_folder_path,
                    feature_columns=('ozone', 'NO2', 'AOT', 'elevation'),
                    image_loader=None):
    """Load every image listed in *df* together with its feature values.

    Rows are handled best-effort: a row whose image cannot be read, or that
    raises while being processed, is reported on stdout and skipped.

    Args:
        df: Metadata table with an ``img_name`` column plus every column
            named in *feature_columns*.
        image_folder_path: Folder the ``img_name`` values are joined onto.
        feature_columns: Columns copied into each record's ``'metadata'``
            dict. Defaults to ozone, NO2, AOT and elevation.
        image_loader: Callable taking a file path and returning the loaded
            image, or ``None`` when the file cannot be read. Defaults to
            ``cv2.imread``.

    Returns:
        A list of dicts ``{'image': <loader result>, 'metadata': {...}}``,
        one per successfully loaded row, in row order.

    Raises:
        KeyError: If *df* has rows but no ``img_name`` column.
    """
    if image_loader is None:
        # Resolved lazily so a caller-supplied loader never touches OpenCV.
        image_loader = cv2.imread

    processed_data = []

    print("\n--- Loading Images and Features (This may take time) ---")

    for index, row in df.iterrows():
        img_name = row['img_name']

        try:
            # Joined inside the try: a non-string name (e.g. a missing value)
            # should cost one row, not abort the whole load.
            full_path = os.path.join(image_folder_path, img_name)

            image = image_loader(full_path)
            if image is None:
                print(f"Skipping row {index}: Could not load image at {full_path}")
                continue

            # Extract the pollutant/feature metadata
            features = {column: row[column] for column in feature_columns}
        except Exception as e:
            # Deliberately broad: one bad row must not stop the others.
            print(f"An error occurred processing row {index}: {e}")
            continue

        # Store the combination
        processed_data.append({
            'image': image,
            'metadata': features,
        })

    print("✅ Data loading complete.")
    return processed_data
92
# ---> Execute the full data loading: one {'image', 'metadata'} record per
# row of `df` whose image could be read.
combined_dataset = load_image_data(df, image_folder_path)
95