From 511f02de1cd8548ac9fb555654675cc8edf46986 Mon Sep 17 00:00:00 2001 From: vitler Date: Tue, 19 May 2026 10:56:43 +0100 Subject: [PATCH] setting up knowledge --- torch-is-cuda-available.py | 18 +++ torch-randomdata-example-Aerossol.py | 95 ++++++++++++++ torch-randomdata-example-MNIST.py | 190 +++++++++++++++++++++++++++ torch-randomdata-example.py | 107 +++++++++++++++ 4 files changed, 410 insertions(+) create mode 100644 torch-is-cuda-available.py create mode 100644 torch-randomdata-example-Aerossol.py create mode 100644 torch-randomdata-example-MNIST.py create mode 100644 torch-randomdata-example.py diff --git a/torch-is-cuda-available.py b/torch-is-cuda-available.py new file mode 100644 index 0000000..145d709 --- /dev/null +++ b/torch-is-cuda-available.py @@ -0,0 +1,18 @@ +import torch + +# Check if GPU is available +if torch.cuda.is_available(): + device = torch.device("cuda") + + # Optional: Print GPU model name for confirmation + gpu_name = torch.cuda.get_device_name(0) + print("="*50) + print(f"✅ GPU Detected and Available! Using device: {device}") + print(f" GPU Model: {gpu_name}") + print("="*50) +else: + device = torch.device("cpu") + print("="*50) + print("⚠️ WARNING: CUDA not available. 
def check_image_paths(df, image_folder_path):
    """Drop metadata rows whose image file is not present in the folder.

    The names in the ``img_name`` column are compared *exactly*
    (case-sensitive, no normalisation) against the entries returned by
    ``os.listdir(image_folder_path)``.

    Args:
        df: DataFrame with an ``img_name`` column.
        image_folder_path: Directory expected to contain the images.

    Returns:
        ``df`` itself when every image is present; otherwise a filtered copy
        that keeps only the rows whose image exists (original index kept).

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be
            listed (for example, it does not exist).
    """
    print("\n--- Running Image Path Check ---")

    # Names actually present in the folder; a set gives O(1) membership tests.
    available_files = set(os.listdir(image_folder_path))

    # Names the metadata refers to, compared as-is against the folder listing.
    required_names = set(df['img_name'])

    missing_files = required_names - available_files

    if not missing_files:
        print("✅ All required images were found in the directory.")
        return df

    # Sets have no stable order; sort so the diagnostic is reproducible.
    # key=str keeps the sort safe if a non-string value (e.g. NaN) is present.
    examples = sorted(missing_files, key=str)[:5]
    print(f"🚨 WARNING: {len(missing_files)} images are missing! Examples: {examples}")

    # Keep only rows that have an image; copy so later column assignments on
    # the result do not act on a slice of the caller's frame.
    df = df[~df['img_name'].isin(missing_files)].copy()
    print(f"Cleaned DataFrame size: {len(df)}")
    return df


def load_image_data(df, image_folder_path):
    """Load each image listed in ``df`` together with its feature values.

    Rows whose image cannot be read are reported on stdout and skipped, so
    the result may be shorter than ``df``.

    Args:
        df: DataFrame with the columns ``img_name``, ``ozone``, ``NO2``,
            ``AOT`` and ``elevation``.
        image_folder_path: Directory that is joined with each ``img_name``.

    Returns:
        A list of dicts, one per successfully loaded row, of the form
        ``{'image': <object returned by cv2.imread>, 'metadata': {...}}``
        where ``metadata`` maps each feature column name to the row's value.

    Raises:
        KeyError: If ``df`` lacks one of the required columns. This is a
            schema problem affecting every row, so it is reported once
            instead of once per row.
    """
    feature_columns = ('ozone', 'NO2', 'AOT', 'elevation')
    absent = [col for col in ('img_name', *feature_columns) if col not in df.columns]
    if absent:
        raise KeyError(f"DataFrame is missing required column(s): {absent}")

    processed_data = []

    print("\n--- Loading Images and Features (This may take time) ---")

    for index, row in df.iterrows():
        img_name = row['img_name']

        try:
            # Path of the image relative to (or inside) image_folder_path.
            full_path = os.path.join(image_folder_path, img_name)
            image = cv2.imread(full_path)
        except (TypeError, cv2.error) as e:
            # e.g. a NaN/non-string name, or an OpenCV decoding failure.
            print(f"An error occurred processing row {index}: {e}")
            continue

        # cv2.imread signals an unreadable file by returning None.
        if image is None:
            print(f"Skipping row {index}: Could not load image at {full_path}")
            continue

        processed_data.append({
            'image': image,
            'metadata': {col: row[col] for col in feature_columns},
        })

    print("✅ Data loading complete.")
    return processed_data
class SimpleCNN(nn.Module):
    """Small two-stage convolutional classifier.

    Each stage is ``Conv2d(3x3, padding=1) -> ReLU -> MaxPool2d(2)``, so a
    stage keeps the channel-wise layout and halves height and width. The
    flattened result feeds a single linear layer that produces one raw score
    (logit) per class.

    Args:
        in_channels: Number of channels of the input images (default 1).
        num_classes: Number of output scores (default 10).
        image_size: Height/width of the square input images (default 28,
            which yields the original ``32 * 7 * 7`` linear input size).

    Raises:
        ValueError: If ``image_size`` is too small to survive both poolings.
    """

    def __init__(self, in_channels=1, num_classes=10, image_size=28):
        super().__init__()

        # Two successive 2x2 poolings floor-halve the side length twice.
        feature_side = image_size // 2 // 2
        if feature_side < 1:
            raise ValueError(
                f"image_size must be at least 4 to survive two poolings, got {image_size}"
            )

        # Stage 1: in_channels -> 16 feature maps, spatial size halved.
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        # Stage 2: 16 -> 32 feature maps, spatial size halved again.
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        # Classifier over the flattened 32 x feature_side x feature_side maps.
        self.fc = nn.Linear(32 * feature_side * feature_side, num_classes)

    def forward(self, x):
        """Return the class scores for a batch shaped (N, in_channels, H, W)."""
        x = self.conv1(x)
        x = self.conv2(x)
        # Keep the batch dimension, merge the rest; unlike ``view`` this also
        # works when the activations are not contiguous in memory.
        x = x.flatten(start_dim=1)
        return self.fc(x)
# ----------------------------------------------
# 💡 Cell 5: Testing and Visualization
# ----------------------------------------------

# Evaluation mode: layers that behave differently at inference time
# (dropout, batch-norm, ...) switch to their inference behaviour.
model.eval()
correct = 0
total = 0

# Gradients are not needed for inference; disabling tracking saves memory.
with torch.no_grad():
    for images, labels in test_loader:
        # Inputs must live on the same device as the model.
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)

        # Index of the highest score along the class dimension is the
        # predicted class. (No ``.data`` needed: autograd is already off.)
        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels.view_as(predicted)).sum().item()

accuracy = 100 * correct / total
print(f"\n🌟 Final Test Accuracy: {accuracy:.2f}%")


# Visualization (Highly recommended in a Jupyter environment)
plt.figure(figsize=(12, 5))

# Plot 1: Loss Curve
plt.subplot(1, 2, 1)
plt.plot(loss_history, marker='o')
plt.title("Training Loss Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss")

# Plot 2: Model performance.
# Per-epoch accuracy is not recorded during training, so the only measured
# value is the final test accuracy; draw it as a flat reference line instead
# of a misleading all-zero placeholder curve.
plt.subplot(1, 2, 2)
plt.plot([accuracy] * len(loss_history), label=f"Final test acc. ({accuracy:.2f}%)")
plt.ylim(0, 100)
plt.title("Model Performance")
plt.xlabel("Epoch")
plt.ylabel("Accuracy (%)")
plt.legend()

plt.tight_layout()
plt.show()

# Save the weights of the model as they are after the last epoch
# (no best-checkpoint selection is performed).
torch.save(model.state_dict(), 'mnist_cnn_model.pth')
print("\nModel weights saved to 'mnist_cnn_model.pth'")
class SimpleDNN(nn.Module):
    """Fully connected network with one hidden layer.

    Data flows ``layer1 -> ReLU -> layer2``; the result of ``layer2`` is
    returned as-is, without a final activation.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of the hidden layer.
        output_size: Number of output values per sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Sub-modules are created in input-to-output order.
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run ``x`` through the hidden layer, the activation and the output layer."""
        hidden = self.relu(self.layer1(x))
        return self.layer2(hidden)
class CustomDataset(Dataset):
    """In-memory dataset pairing a feature tensor with an integer label tensor.

    Args:
        features: Array-like (or tensor) of inputs, one entry per sample along
            the first dimension; stored as ``torch.float32``.
        labels: Array-like (or tensor) of targets, one per sample; stored as
            ``torch.long``.

    Raises:
        ValueError: If ``features`` and ``labels`` do not contain the same
            number of samples.
    """

    def __init__(self, features, labels):
        self.features = self._as_tensor_copy(features, torch.float32)
        self.labels = self._as_tensor_copy(labels, torch.long)

        # Fail at construction time rather than with an IndexError in the
        # middle of an epoch (or by silently ignoring surplus labels).
        if len(self.features) != len(self.labels):
            raise ValueError(
                "features and labels must have the same number of samples, "
                f"got {len(self.features)} and {len(self.labels)}"
            )

    @staticmethod
    def _as_tensor_copy(values, dtype):
        """Return an independent tensor of ``dtype`` holding ``values``."""
        if isinstance(values, torch.Tensor):
            # torch.tensor(tensor) warns about copy-construction; clone
            # explicitly to get the same "own copy" semantics without it.
            return values.detach().clone().to(dtype)
        return torch.tensor(values, dtype=dtype)

    def __len__(self):
        """Return the total number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``{'features': ..., 'labels': ...}``."""
        return {
            'features': self.features[idx],
            'labels': self.labels[idx],
        }
OPTIMIZER STEP (Updates weights) + optimizer.step() + + print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {loss.item():.4f}") -- 2.47.3