-
Notifications
You must be signed in to change notification settings - Fork 13.9k
Add circular tiling support to pad, for Vulkan, CUDA, and CPU (used for making seamless textures) #16985
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Add circular tiling support to pad, for Vulkan, CUDA, and CPU (used for making seamless textures) #16985
Changes from 10 commits
f6ac084
d7f5958
1b62b49
60bed3b
5700a4e
a894631
9861a3d
38f8724
d4a664b
a785537
d9dc234
552e5b2
1c69e4e
3cd8167
429854b
cf720e8
b65967a
a0bbbc2
c9513b4
df6635f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
| @@ -1,9 +1,18 @@ | ||||
| #include <stdint.h> | ||||
|
|
||||
| #include "pad.cuh" | ||||
|
|
||||
|
|
||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||
|
|
||||
| __device__ __forceinline__ int64_t wrap_coord(int64_t coord, int64_t size) { | ||||
| return (coord % size + size) % size; | ||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think |
||||
| } | ||||
|
|
||||
| static __global__ void pad_f32(const float * src, float * dst, | ||||
| const int lp0, const int rp0, const int lp1, const int rp1, | ||||
| const int lp2, const int rp2, const int lp3, const int rp3, | ||||
| const int ne0, const int ne1, const int ne2, const int ne3) { | ||||
| const int ne0, const int ne1, const int ne2, const int ne3, | ||||
| const int circular) { | ||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think wrap_around is a better name? But I'm okay either way |
||||
| // blockIdx.z: i3*ne2+i2 | ||||
| // blockIdx.y: i1 | ||||
| // blockIDx.x: i0 / CUDA_PAD_BLOCK_SIZE | ||||
|
|
@@ -12,39 +21,59 @@ static __global__ void pad_f32(const float * src, float * dst, | |||
| int i1 = blockIdx.y; | ||||
| int i2 = blockIdx.z % ne2; | ||||
| int i3 = blockIdx.z / ne2; | ||||
|
|
||||
| if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { | ||||
| return; | ||||
| } | ||||
|
|
||||
| // operation | ||||
|
|
||||
| const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; | ||||
| if ((i0 >= lp0 && i0 < ne0 - rp0) && | ||||
| (i1 >= lp1 && i1 < ne1 - rp1) && | ||||
| (i2 >= lp2 && i2 < ne2 - rp2) && | ||||
| (i3 >= lp3 && i3 < ne3 - rp3)) { | ||||
| const int64_t i00 = i0 - lp0; | ||||
| const int64_t i01 = i1 - lp1; | ||||
| const int64_t i02 = i2 - lp2; | ||||
| const int64_t i03 = i3 - lp3; | ||||
| const int64_t ne02 = ne2 - lp2 - rp2; | ||||
| const int64_t ne01 = ne1 - lp1 - rp1; | ||||
|
|
||||
| if (circular == 0) { | ||||
| // operation | ||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please leave a more descriptive comment about circular |
||||
| if ((i0 >= lp0 && i0 < ne0 - rp0) && | ||||
| (i1 >= lp1 && i1 < ne1 - rp1) && | ||||
| (i2 >= lp2 && i2 < ne2 - rp2) && | ||||
| (i3 >= lp3 && i3 < ne3 - rp3)) { | ||||
| const int64_t i00 = i0 - lp0; | ||||
| const int64_t i01 = i1 - lp1; | ||||
| const int64_t i02 = i2 - lp2; | ||||
| const int64_t i03 = i3 - lp3; | ||||
| const int64_t ne02 = ne2 - lp2 - rp2; | ||||
| const int64_t ne01 = ne1 - lp1 - rp1; | ||||
| const int64_t ne00 = ne0 - lp0 - rp0; | ||||
|
|
||||
| const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00; | ||||
|
|
||||
| dst[dst_idx] = src[src_idx]; | ||||
| } else { | ||||
| dst[dst_idx] = 0.0f; | ||||
| } | ||||
| } | ||||
| else { | ||||
| const int64_t ne00 = ne0 - lp0 - rp0; | ||||
| const int64_t ne01 = ne1 - lp1 - rp1; | ||||
| const int64_t ne02 = ne2 - lp2 - rp2; | ||||
| const int64_t ne03 = ne3 - lp3 - rp3; | ||||
|
|
||||
| const int64_t i00 = wrap_coord(i0 - lp0, ne00); | ||||
| const int64_t i01 = wrap_coord(i1 - lp1, ne01); | ||||
| const int64_t i02 = wrap_coord(i2 - lp2, ne02); | ||||
| const int64_t i03 = wrap_coord(i3 - lp3, ne03); | ||||
|
|
||||
| const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00; | ||||
|
|
||||
| dst[dst_idx] = src[src_idx]; | ||||
| } else { | ||||
| dst[dst_idx] = 0.0f; | ||||
| } | ||||
| } | ||||
|
|
||||
| static void pad_f32_cuda(const float * src, float * dst, | ||||
| const int lp0, const int rp0, const int lp1, const int rp1, | ||||
| const int lp2, const int rp2, const int lp3, const int rp3, | ||||
| const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { | ||||
| const int ne0, const int ne1, const int ne2, const int ne3, | ||||
| const int circular, cudaStream_t stream) { | ||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. circular should be bool |
||||
| int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; | ||||
| dim3 gridDim(num_blocks, ne1, ne2*ne3); | ||||
| pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1, ne2, ne3); | ||||
| pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1, ne2, ne3, circular); | ||||
| } | ||||
|
|
||||
| void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { | ||||
|
|
@@ -65,8 +94,9 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { | |||
| const int32_t rp2 = ((const int32_t*)(dst->op_params))[5]; | ||||
| const int32_t lp3 = ((const int32_t*)(dst->op_params))[6]; | ||||
| const int32_t rp3 = ((const int32_t*)(dst->op_params))[7]; | ||||
| const int32_t circular = ((const int32_t*)(dst->op_params))[8]; | ||||
|
|
||||
| pad_f32_cuda(src0_d, dst_d, | ||||
| lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, | ||||
| dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); | ||||
| dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], circular, stream); | ||||
| } | ||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same, clang-format