--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <time.h>
+#include <stdatomic.h>
+#include <math.h>
+
+const int m_rows = 40000;
+const int m_cols = m_rows;
+
+_Atomic int hv = 0;
+int **matrix;
+
+typedef struct {
+ int start_r;
+ int end_r;
+ int start_c;
+ int end_c;
+} thread_args;
+
+
+void* parallel_search_blocks(void *arg) {
+ thread_args *args = (thread_args *)arg;
+ for (int r = args->start_r; r < args->end_r; r++) {
+ for (int c = args->start_c; c < args->end_c; c++) {
+ if (matrix[r][c] > hv){
+ hv = matrix[r][c];
+ }
+ }
+ }
+ return NULL;
+}
+
+void run_parallel_search(int num_threads){
+ pthread_t *threads = malloc(num_threads * sizeof(pthread_t));
+ thread_args *args = malloc(num_threads * sizeof(thread_args));
+
+ if (threads == NULL || args == NULL) {
+ perror("Failed to allocate memory for threads");
+ exit(1);
+ }
+
+ int blocks_per_side = (int)sqrt(num_threads);
+ int row_chunk = m_rows / blocks_per_side;
+ int col_chunk = m_cols / blocks_per_side;
+
+ int t_count = 0;
+ for (int i = 0; i < blocks_per_side; i++) {
+ for (int j = 0; j < blocks_per_side; j++) {
+ args[t_count].start_r = i * row_chunk;
+ args[t_count].end_r = (i == blocks_per_side - 1) ? m_rows : (i + 1) * row_chunk;
+
+ args[t_count].start_c = j * col_chunk;
+ args[t_count].end_c = (j == blocks_per_side - 1) ? m_cols : (j + 1) * col_chunk;
+
+ if (pthread_create(&threads[t_count], NULL, ¶llel_search_blocks, &args[t_count]) != 0) {
+ perror("Failed to create thread");
+ exit(1);
+ }
+ t_count++;
+ }
+ }
+
+ for (int i = 0; i < t_count; i++) {
+ pthread_join(threads[i], NULL);
+ }
+
+ free(threads);
+ free(args);
+}
+
+int main(void) {
+ printf("1st --> Allocate memory\n");
+ clock_t t_t1 = clock();
+
+ matrix = malloc(m_rows * sizeof(int*));
+ if (matrix == NULL) { perror("malloc failed"); return 1; }
+
+ int *data;
+ size_t data_size = (size_t)m_rows * m_cols * sizeof(int);
+ if (posix_memalign((void**)&data, 64, data_size) != 0) {
+ perror("alignment failed");
+ free(matrix);
+ return 1;
+ }
+
+ t_t1 = clock() - t_t1;
+ printf(" %f sec\n", ((double)t_t1) / CLOCKS_PER_SEC);
+
+ printf("2nd --> Populate the matrix\n");
+ clock_t t_t2 = clock();
+ for (int r = 0; r < m_rows; r++) {
+ matrix[r] = data + r * m_cols;
+ }
+ for (int r = 0; r < m_rows; r++) {
+ for (int c = 0; c < m_cols; c++) {
+ matrix[r][c] = r * m_cols + c;
+ }
+ }
+ t_t2 = clock() - t_t2;
+ printf(" %f sec\n", ((double)t_t2) / CLOCKS_PER_SEC);
+
+ printf("3rd --> Search matrix (Block Split)\n");
+ clock_t t_t3 = clock();
+
+ int threads_to_use = 8000;
+ run_parallel_search(threads_to_use);
+
+ t_t3 = clock() - t_t3;
+ printf(" %f sec\n", ((double)t_t3) / CLOCKS_PER_SEC);
+
+ printf("Done\n The biggest value found is %d\n", hv);
+ free(data);
+ free(matrix);
+ return 0;
+}
\ No newline at end of file