#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>
#include <stdatomic.h>
#include <math.h>

const int m_rows = 40000;
const int m_cols = m_rows;

_Atomic int hv = 0;
int **matrix;

typedef struct {
    int start_r;
    int end_r;
    int start_c;
    int end_c;
} thread_args;


void* parallel_search_blocks(void *arg) {
    thread_args *args = (thread_args *)arg;
    for (int r = args->start_r; r < args->end_r; r++) {
        for (int c = args->start_c; c < args->end_c; c++) {
            if (matrix[r][c] > hv){
                hv = matrix[r][c];
            }
        }
    }
    return NULL;
}

void run_parallel_search(int num_threads){
    pthread_t *threads = malloc(num_threads * sizeof(pthread_t));
    thread_args *args = malloc(num_threads * sizeof(thread_args));

    if (threads == NULL || args == NULL) {
        perror("Failed to allocate memory for threads");
        exit(1);
    }

    int blocks_per_side = (int)sqrt(num_threads);
    int row_chunk = m_rows / blocks_per_side;
    int col_chunk = m_cols / blocks_per_side;

    int t_count = 0;
    for (int i = 0; i < blocks_per_side; i++) {
        for (int j = 0; j < blocks_per_side; j++) {
            args[t_count].start_r = i * row_chunk;
            args[t_count].end_r = (i == blocks_per_side - 1) ? m_rows : (i + 1) * row_chunk;
            
            args[t_count].start_c = j * col_chunk;
            args[t_count].end_c = (j == blocks_per_side - 1) ? m_cols : (j + 1) * col_chunk;

            if (pthread_create(&threads[t_count], NULL, &parallel_search_blocks, &args[t_count]) != 0) {
                perror("Failed to create thread");
                exit(1);
            }
            t_count++;
        }
    }

    for (int i = 0; i < t_count; i++) {
        pthread_join(threads[i], NULL);
    }
    
    free(threads);
    free(args);
}

int main(void) {
    printf("1st --> Allocate memory\n");
    clock_t t_t1 = clock();
    
    matrix = malloc(m_rows * sizeof(int*));
    if (matrix == NULL) { perror("malloc failed"); return 1; }
    
    int *data;
    size_t data_size = (size_t)m_rows * m_cols * sizeof(int);
    if (posix_memalign((void**)&data, 64, data_size) != 0) {
        perror("alignment failed");
        free(matrix);
        return 1;
    }
    
    t_t1 = clock() - t_t1;
    printf(" %f sec\n", ((double)t_t1) / CLOCKS_PER_SEC);

    printf("2nd --> Populate the matrix\n");
    clock_t t_t2 = clock();
    for (int r = 0; r < m_rows; r++) {
        matrix[r] = data + r * m_cols;
    }
    for (int r = 0; r < m_rows; r++) {  
        for (int c = 0; c < m_cols; c++) {
            matrix[r][c] = r * m_cols + c;
        }
    }
    t_t2 = clock() - t_t2;
    printf(" %f sec\n", ((double)t_t2) / CLOCKS_PER_SEC);  
  
    printf("3rd --> Search matrix (Block Split)\n");
    clock_t t_t3 = clock();
    
    int threads_to_use = 8000; 
    run_parallel_search(threads_to_use);
    
    t_t3 = clock() - t_t3;
    printf(" %f sec\n", ((double)t_t3) / CLOCKS_PER_SEC);
 
    printf("Done\n The biggest value found is %d\n", hv); 
    free(data);
    free(matrix);
    return 0;
}