12 VEC_ARRAY_1D(
v8ph_t, _local_data_hp, local_data);
13 VEC_ARRAY_1D(
v8fp_t, _local_data_sp, local_data);
14 const int page_size = (1 << LOG_PAGE_SIZE);
15 const int max_transfer_size = page_size;
16 const int total_bytes =
18 int num_xfers =
FRAC_CEIL(total_bytes, max_transfer_size);
19 int num_bytes_remaining = total_bytes;
21 for (
int i = 0; i < num_xfers; i++) {
22 int transfer_size = min2(num_bytes_remaining, max_transfer_size);
23 int curr_offset = (i * page_size * 2) /
sizeof(
float);
24 hostLoad(local_data + local_offset + curr_offset,
25 remote_data + remote_offset + curr_offset,
34 int page_offset_vec = (local_offset + curr_offset) /
VECTOR_SIZE;
36 for (
int v = num_vectors - 1; v >= 0; v--) {
37 v8ph_t fp16_data = _local_data_hp[page_offset_vec * 2 + v];
38 v8fp_t fp32_data = _CVT_PH_PS_256(fp16_data);
39 _local_data_sp[page_offset_vec + v] = fp32_data;
41 num_bytes_remaining -= transfer_size;
50 VEC_ARRAY_1D(
v8ph_t, _local_data_hp, local_data);
51 VEC_ARRAY_1D(
v8fp_t, _local_data_sp, local_data);
52 const int page_size = (1 << LOG_PAGE_SIZE);
53 const int max_transfer_size = page_size;
54 const int total_bytes =
56 int num_xfers =
FRAC_CEIL(total_bytes, max_transfer_size);
57 int num_bytes_remaining = total_bytes;
59 for (
int i = 0; i < num_xfers; i++) {
60 int transfer_size = min2(num_bytes_remaining, max_transfer_size);
62 int eff_transfer_size = transfer_size * 2;
63 int curr_offset = (i * 2 * page_size) /
sizeof(
float);
67 int page_offset_vec = (local_offset + curr_offset) /
VECTOR_SIZE;
69 for (
int v = 0; v < num_vectors; v++){
70 v8fp_t fp32_data = _local_data_sp[page_offset_vec + v];
71 v8ph_t fp16_data = _CVT_PS_PH_256(fp32_data, 0);
72 _local_data_hp[page_offset_vec * 2 + v] = fp16_data;
75 hostStore(remote_data + remote_offset + curr_offset,
76 local_data + local_offset + curr_offset,
79 num_bytes_remaining -= transfer_size;