#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <pthread.h>

/*
 * Matrix dimensions; the matrix is square, so m_cols equals m_rows.
 *
 * These are enumeration constants rather than `const int` objects: in C a
 * const-qualified variable is not a constant expression, so initialising a
 * file-scope m_cols from a `const int m_rows` is accepted only as a compiler
 * extension. Enumeration constants have type int and are true constants.
 */
enum { m_rows = 40000, m_cols = m_rows };

/* Highest matrix value found so far; compared and updated by the search threads. */
_Atomic(int) hv = 0;

/* Table of row pointers through which the matrix is indexed as matrix[row][col]. */
int **matrix = NULL;

/* Column range handed to one search thread: columns start .. end-1 are scanned. */
typedef struct {
  int start;  /* first column to scan (inclusive) */
  int end;    /* column at which scanning stops (exclusive) */
} thread_args;

void* parallel_search_cols(void *arg){
  thread_args *args = (thread_args *)arg;
  for(int r = 0; r < m_rows; r++){
    for(int c = args->start; c < args->end; c++){
      if(matrix[r][c] > hv){
        hv = matrix[r][c];
      }
    }
  }
  free(args);
  return NULL;
}

/*
 * Splits the m_cols matrix columns into num_threads contiguous ranges, runs
 * parallel_search_cols on each range in its own thread, and waits for all of
 * the threads to finish.
 *
 * num_threads: number of worker threads to create; must be at least 1.
 *
 * Any failure (invalid thread count, allocation, thread creation or join) is
 * reported on stderr and terminates the process with exit(1).
 */
void run_parallel_search(int num_threads) {
    // Reject non-positive counts up front: 0 would divide by zero below and
    // a negative value would turn into a huge allocation size.
    if (num_threads < 1) {
        fprintf(stderr, "Invalid number of threads: %d\n", num_threads);
        exit(1);
    }
    size_t count = (size_t)num_threads;

    // 1. Allocate the thread handles and one argument struct per thread
    pthread_t *threads = malloc(count * sizeof *threads);
    thread_args *args = malloc(count * sizeof *args);

    if (threads == NULL || args == NULL) {
        perror("Failed to allocate memory for threads");
        exit(1);
    }

    // 2. Base number of columns per thread (integer division; the remainder
    //    is given to the last thread)
    int chunk_size = m_cols / num_threads;

    // 3. Create the threads
    for (int i = 0; i < num_threads; i++) {
        args[i].start = i * chunk_size;
        args[i].end = (i == num_threads - 1) ? m_cols : (i + 1) * chunk_size;

        // pthread functions return the error number instead of setting
        // errno, so the message is built with strerror(rc), not perror().
        int rc = pthread_create(&threads[i], NULL, parallel_search_cols, &args[i]);
        if (rc != 0) {
            fprintf(stderr, "Failed to create thread: %s\n", strerror(rc));
            exit(1);
        }
    }

    // 4. Wait for all threads to finish
    for (int i = 0; i < num_threads; i++) {
        int rc = pthread_join(threads[i], NULL);
        if (rc != 0) {
            fprintf(stderr, "Failed to join thread: %s\n", strerror(rc));
            exit(1);
        }
    }

    // 5. Release both arrays. The args array is owned by this function: each
    //    thread only borrows a pointer to its own element, so the array is
    //    freed exactly once, here, after every thread has been joined.
    free(threads);
    free(args);
}

/*
 * Returns the wall-clock time in seconds elapsed since *start, or 0.0 if the
 * current time cannot be obtained.
 */
static double seconds_since(const struct timespec *start){
  struct timespec now;
  if(timespec_get(&now, TIME_UTC) != TIME_UTC){
    return 0.0;
  }
  return (double)(now.tv_sec - start->tv_sec)
       + (double)(now.tv_nsec - start->tv_nsec) / 1e9;
}

/*
 * Benchmark driver: allocates an m_rows x m_cols int matrix, fills it with
 * increasing values, searches it for the maximum using worker threads, and
 * prints the time taken by each phase followed by the value found.
 *
 * Phases are timed with wall-clock time. clock() is not used because it
 * reports processor time consumed by the whole process, which for the
 * threaded search adds up the time of all threads instead of showing how
 * long the search actually took.
 *
 * Returns 0 on success and 1 if an allocation fails.
 */
int main(void){
  struct timespec phase_start = {0};

  printf("1st --> Allocate memory\n");
  (void)timespec_get(&phase_start, TIME_UTC);
  size_t rows_size = (size_t)m_rows * sizeof *matrix;
  matrix = malloc(rows_size);
  if(matrix == NULL){
    perror("malloc failled");
    return 1;
  }
  size_t data_size = (size_t)m_rows * m_cols * sizeof(int);
  // posix_memalign takes a void **; hand it a real void * rather than
  // casting the address of an int *.
  void *raw = NULL;
  int rc = posix_memalign(&raw, 64, data_size);
  if(rc != 0){
    // posix_memalign returns the error number and does not set errno.
    fprintf(stderr, "not able to alocate memory: %s\n", strerror(rc));
    free(matrix);
    matrix = NULL;
    return 1;
  }
  int *data = raw;
  printf(" %f sec\n", seconds_since(&phase_start));

  printf("2nd --> Populate the matrix\n");
  (void)timespec_get(&phase_start, TIME_UTC);
  // Point each row at its slice of the single contiguous data buffer; the
  // offset is computed in size_t so it cannot overflow int arithmetic.
  for(int r = 0; r < m_rows; r++){
    matrix[r] = data + (size_t)r * m_cols;
  }
  // Every cell holds its own linear index, so the largest value is in the
  // last cell: m_rows * m_cols - 1.
  for(int r = 0; r < m_rows; r++){
    int *row = matrix[r];
    for(int c = 0; c < m_cols; c++){
      row[c] = r*m_cols+c;
    }
  }
  printf(" %f sec\n", seconds_since(&phase_start));

  printf("3rd --> Search matrix\n");
  (void)timespec_get(&phase_start, TIME_UTC);
  int threads_to_use = 4;
  run_parallel_search(threads_to_use);
  printf(" %f sec\n", seconds_since(&phase_start));

  // Read the atomic once into a plain int before handing it to printf.
  int biggest = hv;
  // The trailing newline terminates the last output line.
  printf("Done\n The biggest value found is  %d\n", biggest);
  free(data);
  free(matrix);
  matrix = NULL;
  return 0;
}
