From 511f02de1cd8548ac9fb555654675cc8edf46986 Mon Sep 17 00:00:00 2001
From: vitler <vitor.goncalo.costa@gmail.com>
Date: Tue, 19 May 2026 10:56:43 +0100
Subject: [PATCH] setting up knowledge

---
 torch-is-cuda-available.py           |  18 +++
 torch-randomdata-example-Aerossol.py |  95 ++++++++++++++
 torch-randomdata-example-MNIST.py    | 190 +++++++++++++++++++++++++++
 torch-randomdata-example.py          | 107 +++++++++++++++
 4 files changed, 410 insertions(+)
 create mode 100644 torch-is-cuda-available.py
 create mode 100644 torch-randomdata-example-Aerossol.py
 create mode 100644 torch-randomdata-example-MNIST.py
 create mode 100644 torch-randomdata-example.py

diff --git a/torch-is-cuda-available.py b/torch-is-cuda-available.py
new file mode 100644
index 0000000..145d709
--- /dev/null
+++ b/torch-is-cuda-available.py
@@ -0,0 +1,18 @@
+import torch
+
+# Select the compute device: prefer the first CUDA GPU, otherwise fall back to CPU.
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+
+    # Report the name of GPU 0 so the user can confirm which card was picked up.
+    gpu_name = torch.cuda.get_device_name(0)
+    print("=" * 50)
+    print(f"✅ GPU Detected and Available! Using device: {device}")
+    print(f"   GPU Model: {gpu_name}")
+    print("=" * 50)
+else:
+    device = torch.device("cpu")
+    print("=" * 50)
+    print("⚠️ WARNING: CUDA not available. Falling back to CPU.")
+    print("   (If you have a GPU, ensure you installed the correct PyTorch version for CUDA.)")
+    print("=" * 50)
\ No newline at end of file
diff --git a/torch-randomdata-example-Aerossol.py b/torch-randomdata-example-Aerossol.py
new file mode 100644
index 0000000..2f5189b
--- /dev/null
+++ b/torch-randomdata-example-Aerossol.py
@@ -0,0 +1,95 @@
+import pandas as pd
+import os
+import cv2  # OpenCV for image handling
+
+# ----------------------------------------------
+#  Task 1: Get data
+#       - load the metadata CSV
+#       - validate that every image listed in the CSV exists in the folder
+#       - collect each image array together with its metadata attributes
+# ----------------------------------------------
+
+# Input locations (relative to the current working directory)
+csv_path = 'data/data/train.csv'
+image_folder_path = './data/data/'  # <-- UPDATE THIS PATH!
+
+# Load the metadata; nothing below can run without it, so abort with a failure status.
+try:
+    df = pd.read_csv(csv_path)
+    print("Metadata loaded successfully.")
+except FileNotFoundError:
+    print("Error: train.csv not found.")
+    raise SystemExit(1)  # exit() is a `site` helper and would report success (status 0)
+
+
+def check_image_paths(df, image_folder_path):
+    """Return ``df`` restricted to rows whose ``img_name`` exists in ``image_folder_path``.
+
+    Names are compared exactly as written; a summary of the check is printed.
+    """
+    print("\n--- Running Image Path Check ---")
+
+    # File names actually present in the folder (non-recursive listing)
+    available_files = set(os.listdir(image_folder_path))
+    # Names referenced by the CSV that have no matching file on disk
+    missing_files = set(df['img_name']) - available_files
+
+    if missing_files:
+        # Sort so the reported examples are stable from run to run.
+        examples = sorted(missing_files, key=str)[:5]
+        print(f"🚨 WARNING: {len(missing_files)} images are missing! Examples: {examples}")
+        df = df[~df['img_name'].isin(missing_files)]
+        print(f"Cleaned DataFrame size: {len(df)}")
+    else:
+        print("✅ All required images were found in the directory.")
+
+    return df
+
+# ---> Run the check and keep only the rows whose image file exists
+df = check_image_paths(df=df, image_folder_path=image_folder_path)
+
+
+def load_image_data(df, image_folder_path):
+    """Load every image listed in ``df`` together with its pollutant metadata.
+
+    Args:
+        df: DataFrame with an ``img_name`` column plus the ``ozone``, ``NO2``,
+            ``AOT`` and ``elevation`` columns.
+        image_folder_path: Directory that ``img_name`` values are joined onto.
+
+    Returns:
+        A list of dicts ``{'image': <result of cv2.imread>, 'metadata': {...}}``,
+        one per row that loaded. Rows whose image cannot be read, or that raise
+        while being processed, are reported on stdout and skipped.
+    """
+    feature_columns = ('ozone', 'NO2', 'AOT', 'elevation')
+    processed_data = []
+
+    print("\n--- Loading Images and Features (This may take time) ---")
+
+    for index, row in df.iterrows():
+        # Path of the image, relative to wherever image_folder_path points
+        full_path = os.path.join(image_folder_path, row['img_name'])
+
+        try:
+            # cv2.imread signals an unreadable/missing file by returning None
+            image = cv2.imread(full_path)
+            if image is None:
+                print(f"Skipping row {index}: Could not load image at {full_path}")
+                continue
+
+            # Store the image array alongside the pollutant/feature metadata
+            processed_data.append({
+                'image': image,
+                'metadata': {name: row[name] for name in feature_columns},
+            })
+        except Exception as e:
+            # Best-effort loading: report the bad row and carry on with the rest.
+            print(f"An error occurred processing row {index}: {e}")
+
+    print("✅ Data loading complete.")
+    return processed_data
+
+# ---> Load every image together with its metadata
+combined_dataset = load_image_data(df=df, image_folder_path=image_folder_path)
+
diff --git a/torch-randomdata-example-MNIST.py b/torch-randomdata-example-MNIST.py
new file mode 100644
index 0000000..37cc0a9
--- /dev/null
+++ b/torch-randomdata-example-MNIST.py
@@ -0,0 +1,190 @@
+# ----------------------------------------------
+# 💡 Cell 1: Setup and Imports
+# ----------------------------------------------
+
+# 1. Import Core Libraries
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader
+from torchvision import datasets, transforms
+import matplotlib.pyplot as plt
+import numpy as np
+
+# 2. Device Configuration
+# Use the CUDA GPU when PyTorch can see one; otherwise everything runs on the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"✅ Successfully initialized. Using device: {device}")
+
+# Report which GPU was selected (index 0), which helps when debugging.
+if device.type == 'cuda':
+    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
+
+# ----------------------------------------------
+# 💡 Cell 2: Data Loading and Transformations
+# ----------------------------------------------
+
+# Define the preprocessing steps
+transform = transforms.Compose([
+    transforms.ToTensor(),  # Converts the image to a float Tensor scaled to [0, 1]
+    transforms.Normalize((0.5,), (0.5,))  # (x - 0.5) / 0.5: rescales [0, 1] to [-1, 1]
+])
+
+# Download (if not already under ./data) and load the training split
+train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
+
+# Create the DataLoader (handles batching and shuffling)
+BATCH_SIZE = 64
+train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
+
+# Same for the test split; no shuffling is needed for evaluation
+test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
+test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)
+
+print("✅ Data loaded and prepared successfully!")
+
+# ----------------------------------------------
+# 💡 Cell 3: Model Definition and GPU Transfer
+# ----------------------------------------------
+
+# Define the CNN Model Architecture
+class SimpleCNN(nn.Module):
+    """Two conv/ReLU/max-pool stages feeding a 10-way linear layer (32 * 7 * 7 inputs)."""
+
+    def __init__(self):
+        super().__init__()
+        # Stage 1: 1 -> 16 channels, 3x3 kernel; the pool halves height and width
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(kernel_size=2),
+        )
+        # Stage 2: 16 -> 32 channels, same layout as stage 1
+        self.conv2 = nn.Sequential(
+            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(kernel_size=2),
+        )
+        # Classifier over the flattened 32 x 7 x 7 feature maps
+        self.fc = nn.Linear(32 * 7 * 7, 10)
+
+    def forward(self, x):
+        """Return the raw class scores (logits), one row of 10 per input sample."""
+        features = self.conv2(self.conv1(x))
+        # Flatten everything but the batch dimension before the linear layer
+        return self.fc(features.view(features.size(0), -1))
+
+# Initialize the model
+model = SimpleCNN()
+
+# Move the model's parameters and buffers to the selected device (GPU or CPU)
+model.to(device)
+
+print(f"✅ Model defined and weights transferred to {device}!")
+
+
+# ----------------------------------------------
+# 💡 Cell 4: The Training Loop
+# ----------------------------------------------
+
+# Setup hyperparameters
+NUM_EPOCHS = 10
+LEARNING_RATE = 0.001
+
+# Loss function and Optimizer
+criterion = nn.CrossEntropyLoss()
+optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
+
+# Per-epoch history for plotting: mean batch loss and training accuracy (%)
+loss_history = []
+acc_history = []
+
+print("🚀 Starting Training...")
+
+for epoch in range(NUM_EPOCHS):
+    # Set model to training mode
+    model.train()
+    total_loss = 0.0
+    train_correct = 0
+    train_seen = 0
+
+    for images, labels in train_loader:
+        # Inputs and labels must live on the same device as the model
+        images = images.to(device)
+        labels = labels.to(device)
+
+        # 1. Zero Gradients (they accumulate across backward() calls otherwise)
+        optimizer.zero_grad()
+        # 2. Forward Pass
+        outputs = model(images)
+        # 3. Calculate Loss
+        loss = criterion(outputs, labels)
+
+        # 4. Backward Pass (autograd computes the gradients on `device`)
+        loss.backward()
+        # 5. Optimize (Updates the weights)
+        optimizer.step()
+
+        total_loss += loss.item()
+        train_correct += (outputs.argmax(dim=1) == labels).sum().item()
+        train_seen += labels.size(0)
+
+    # Average of the per-batch losses, and accuracy over all samples seen this epoch
+    avg_loss = total_loss / len(train_loader)
+    loss_history.append(avg_loss)
+    acc_history.append(100 * train_correct / train_seen)
+    print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Loss: {avg_loss:.4f}")
+
+print("🎉 Training Complete!")
+
+
+# ----------------------------------------------
+# 💡 Cell 5: Testing and Visualization
+# ----------------------------------------------
+
+# Evaluation mode: layers such as dropout/batch-norm switch to inference behaviour
+model.eval()
+correct = 0
+total = 0
+
+with torch.no_grad():  # no gradient tracking is needed for evaluation (saves memory)
+    for images, labels in test_loader:
+        # Inputs and labels must live on the same device as the model
+        images = images.to(device)
+        labels = labels.to(device)
+
+        outputs = model(images)
+
+        # The predicted class is the index of the highest score in each row
+        predicted = outputs.argmax(dim=1)
+
+        total += labels.size(0)
+        correct += (predicted == labels).sum().item()
+
+accuracy = 100 * correct / total
+print(f"\n📊 Final Test Accuracy: {accuracy:.2f}%")
+
+
+# Save the final weights first: plt.show() below can block until the window is closed
+torch.save(model.state_dict(), 'mnist_cnn_model.pth')
+print("\nModel weights saved to 'mnist_cnn_model.pth'")
+
+# Visualization (Highly recommended in a Jupyter environment)
+plt.figure(figsize=(12, 5))
+
+# Plot 1: Loss Curve
+plt.subplot(1, 2, 1)
+plt.plot(loss_history, marker='o')
+plt.title("Training Loss Over Epochs")
+plt.xlabel("Epoch")
+plt.ylabel("Loss")
+
+# Plot 2: per-epoch accuracy when recorded, otherwise a flat placeholder line
+plt.subplot(1, 2, 2)
+plt.plot(acc_history or [0] * len(loss_history))
+plt.title("Model Performance")
+plt.xlabel("Epoch")
+plt.ylabel("Accuracy (%)")
+
+plt.tight_layout()
+plt.show()
diff --git a/torch-randomdata-example.py b/torch-randomdata-example.py
new file mode 100644
index 0000000..4354a53
--- /dev/null
+++ b/torch-randomdata-example.py
@@ -0,0 +1,107 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import Dataset, DataLoader
+import numpy as np
+
+
+# ----------------------------------- BUILD MODEL ---------------------------------------------
+# 1. Define the Model Class
+class SimpleDNN(nn.Module):
+    """Fully connected network: Linear -> ReLU -> Linear."""
+
+    def __init__(self, input_size, hidden_size, output_size):
+        super().__init__()
+        self.layer1 = nn.Linear(input_size, hidden_size)  # input -> hidden
+        self.relu = nn.ReLU()
+        self.layer2 = nn.Linear(hidden_size, output_size)  # hidden -> output
+
+    def forward(self, x):
+        """Return the output of layer2 applied to the ReLU-activated layer1 output."""
+        hidden = self.relu(self.layer1(x))
+        scores = self.layer2(hidden)
+        return scores
+
+# Network dimensions: 784 inputs (like a flattened MNIST image), 10 output classes
+INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE = 784, 128, 10
+model = SimpleDNN(
+    input_size=INPUT_SIZE, hidden_size=HIDDEN_SIZE, output_size=OUTPUT_SIZE
+)
+
+
+# ----------------------------------- GPU INTEGRATION ---------------------------------------------
+# 1. Pick the CUDA device when available, otherwise the CPU
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+print(f"Using device: {device}")
+
+# 2. Move the model's parameters to the selected device
+model.to(device)
+
+# Training setup: hyperparameters, loss function and optimizer
+NUM_EPOCHS = 10
+LEARNING_RATE = 0.001
+criterion = nn.CrossEntropyLoss()
+optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
+
+
+# ----------------------------------- LOAD DATA ---------------------------------------------
+# 1. DEFINE THE CUSTOM DATASET
+class CustomDataset(Dataset):
+    """Map-style dataset over in-memory features and integer labels."""
+
+    def __init__(self, features, labels):
+        # Copy the inputs into tensors: float32 features, int64 (long) labels
+        self.features = torch.tensor(features, dtype=torch.float32)
+        self.labels = torch.tensor(labels, dtype=torch.long)
+
+    def __len__(self):
+        """Return the number of samples (length of the features tensor)."""
+        return len(self.features)
+
+    def __getitem__(self, idx):
+        """Return sample ``idx`` as ``{'features': ..., 'labels': ...}``.
+
+        Consumers read each batch through these two keys.
+        """
+        return dict(features=self.features[idx], labels=self.labels[idx])
+
+# 2. PLACEHOLDER DATA (REPLACE THIS WITH YOUR ACTUAL LOADING CODE)
+# 100 random samples: uniform [0, 1) features and integer labels in [0, OUTPUT_SIZE)
+DUMMY_FEATURES = np.random.rand(100, INPUT_SIZE).astype(np.float32)
+DUMMY_LABELS = np.random.randint(low=0, high=OUTPUT_SIZE, size=100).astype(np.int64)
+
+# 3. WRAP THE ARRAYS IN A DATASET AND A SHUFFLING, BATCHED LOADER
+train_dataset = CustomDataset(features=DUMMY_FEATURES, labels=DUMMY_LABELS)
+
+# Samples per batch
+BATCH_SIZE = 64
+train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
+
+
+# ----------------------------------- TRAINING ---------------------------------------------
+# --- The Training Loop ---
+for epoch in range(NUM_EPOCHS):
+    epoch_loss = 0.0  # sum of per-batch losses, averaged for the report below
+    for data in train_loader:
+        # 1. MOVE DATA TO THE SAME DEVICE AS THE MODEL
+        inputs = data['features'].to(device)
+        labels = data['labels'].to(device)
+
+        # 2. ZERO GRADIENTS (otherwise they accumulate across backward() calls)
+        optimizer.zero_grad()
+
+        # 3. FORWARD PASS
+        outputs = model(inputs)
+
+        # 4. CALCULATE LOSS
+        loss = criterion(outputs, labels)
+
+        # 5. BACKWARD PASS (Calculates gradients)
+        loss.backward()
+
+        # 6. OPTIMIZER STEP (Updates weights)
+        optimizer.step()
+        epoch_loss += loss.item()
+
+    # Report the mean batch loss; max(..., 1) keeps an empty loader from dividing by zero
+    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {epoch_loss / max(len(train_loader), 1):.4f}")
-- 
2.47.3