A[m,n] + B[m,n] = C[m,n], where m and n are the dimensions of the matrices.
/* Operand matrices (mat_a, mat_b) and result matrix (mat_c). */
float mat_a[NUM_ROW][NUM_COL];
float mat_b[NUM_ROW][NUM_COL];
float mat_c[NUM_ROW][NUM_COL];

/*
 * Scalar reference implementation: stores the element-wise sum of
 * mat_a and mat_b into mat_c, one element at a time.
 */
int main(void)
{
    int row;
    int col;

    for (row = 0; row < NUM_ROW; row++) {
        for (col = 0; col < NUM_COL; col++) {
            mat_c[row][col] = mat_a[row][col] + mat_b[row][col];
        }
    }

    return 0;
}
An ALF host program can be logically divided into several sections:
matrix_add/STEP1a_partition_scheme_A/common/host_partition
alf_handle_t alf_handle; unsigned int nodes; /* initializes the runtime environment for ALF*/ alf_init(&config_parms, &alf_handle;); /* get the number of SPE accelerators available for from the Opteron */ rc = alf_query_system_info(alf_handle, ALF_QUERY_NUM_ACCEL, ALF_ACCEL_TYPE_SPE, &nodes;); /* set the total number of accelerator instances (in this case, SPE) */ /* the ALF runtime will have during its lifetime */ rc = alf_num_instances_set (alf_handle, nodes);
/* variable declarations */ alf_task_desc_handle_t task_desc_handle; alf_task_handle_t task_handle; const char* spe_image_name; const char* library_path_name; const char* comp_kernel_name; /* describing a task that's executable on the SPE*/ alf_task_desc_create(alf_handle, ALF_ACCEL_TYPE_SPE, &task_desc_handle;); alf_task_desc_set_int32(task_desc_handle, ALF_TASK_DESC_TSK_CTX_SIZE, 0); alf_task_desc_set_int32(task_desc_handle, ALF_TASK_DESC_WB_PARM_CTX_BUF_SIZE, sizeof(add_parms_t)); alf_task_desc_set_int32(task_desc_handle, ALF_TASK_DESC_WB_IN_BUF_SIZE, H * V * 2 sizeof(float)); alf_task_desc_set_int32(task_desc_handle, ALF_TASK_DESC_WB_OUT_BUF_SIZE, H * V * sizeof(float)); alf_task_desc_set_int32(task_desc_handle, ALF_TASK_DESC_NUM_DTL_ENTRIES, 8); alf_task_desc_set_int32(task_desc_handle, ALF_TASK_DESC_MAX_STACK_SIZE, 4096); /* providing the SPE executable name */ alf_task_desc_set_int64(task_desc_handle, ALF_TASK_DESC_ACCEL_IMAGE_REF_L,(unsigned long long) spe_image_name); alf_task_desc_set_int64(task_desc_handle, ALF_TASK_DESC_ACCEL_LIBRARY_REF_L,(unsigned long) library_path_name); alf_task_desc_set_int64(task_desc_handle, ALF_TASK_DESC_ACCEL_KERNEL_REF_L,(unsigned long) comp_kernel_name);
This section shows how work blocks are created. After the program has created the work block, it describes the input and output associated with each work block. Each work block contains the input description for blocks in the input matrices of size H * V starting at location matrix[row][0] with H and V representing the horizontal and vertical dimensions of the block.
alf_wb_handle_t wb_handle; add_parms_t parm __attribute__((aligned(128))); parm.h = H; /* horizontal size of the block */ parm.v = V; /* vertical size of the block */ /* creating work blocks and adding param & io buffer */ for (i = 0; i < NUM_ROW; i += H) { alf_wb_create(task_handle, ALF_WB_SINGLE, 0,&wb_handle); /* begins a new Data Transfer List for INPUT */ alf_wb_dtl_set_begin(wb_handle, ALF_BUF_IN, 0); /* Add H*V element of mat_a as Input */ alf_wb_dtl_set_entry_add(wb_handle, &matrix_a[i][0], H * V, ALF_DATA_FLOAT); /* Add H*V element of mat_b as Input */ alf_wb_dtl_set_entry_add(wb_handle, &matrix_b[i][0], H * V, ALF_DATA_FLOAT); alf_wb_dtl_set_end(wb_handle); /* begins a new Data Transfer List OUTPUT */ alf_wb_dtl_set_begin(wb_handle, ALF_BUF_OUT, 0); /* Add H*V element of mat_c as Output */ alf_wb_dtl_set_entry_add(wb_handle, &matrix_c[i][0], H * V, ALF_DATA_FLOAT); alf_wb_dtl_set_end(wb_handle); /* pass parameters H and V to spu */ alf_wb_parm_add(wb_handle, (void *) (&parm), sizeof(parm), ALF_DATA_BYTE, 0); /* enqueuing work block */ alf_wb_enqueue(wb_handle); } alf_task_finalize(task_handle);
/*
 * Wait for every work block of the task to complete.
 * NOTE(review): the -1 argument is presumably "no timeout" - confirm
 * against the ALF API reference.
 */
alf_task_wait(task_handle, -1);

/* Shut down the ALF runtime. */
alf_exit(alf_handle, ALF_EXIT_WAIT, -1);
int alf_accel_comp_kernel(void *p_task_context, void *p_parm_context, void *p_input_buffer, void *p_output_buffer, void *p_inout_buffer, unsigned int current_count, unsigned int total_count) { unsigned int i, cnt; vector float *sa, *sb, *sc; add_parms_t *p_parm = (add_parms_t *)p_parm_context; cnt = p_parm->h * p_parm->v / 4; sa = (vector float *) p_input_buffer; sb = sa + cnt; sc = (vector float *) p_output_buffer; for (i = 0; i < cnt; i += 4) { sc[i] = spu_add(sa[i], sb[i]); sc[i + 1] = spu_add(sa[i + 1], sb[i + 1]); sc[i + 2] = spu_add(sa[i + 2], sb[i + 2]); sc[i + 3] = spu_add(sa[i + 3], sb[i + 3]); } return 0; }