Commit 90c8d78 — "Add Transpose Layer for 2D matrix transposition in 4D tensors" (davisking/dlib#3013).
Parent commit: fe6e052.

File tree: 10 files changed (+298, −1 lines).

dlib/cuda/cpu_dlib.cpp

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2927,7 +2927,47 @@ namespace dlib
29272927
}
29282928

29292929
// ------------------------------------------------------------------------------------
2930-
// ------------------------------------------------------------------------------------
2930+
2931+
void transpose(
2932+
bool add,
2933+
tensor& dest,
2934+
const tensor& src
2935+
)
2936+
{
2937+
DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
2938+
dest.k() == src.k() &&
2939+
dest.nr() == src.nc() &&
2940+
dest.nc() == src.nr(),
2941+
"Incompatible tensor dimensions.");
2942+
2943+
const float* src_data = src.host();
2944+
float* dest_data = dest.host();
2945+
2946+
const long num_samples = src.num_samples();
2947+
const long k_dim = src.k();
2948+
const long src_nr = src.nr();
2949+
const long src_nc = src.nc();
2950+
const long dest_nr = dest.nr();
2951+
const long dest_nc = dest.nc();
2952+
2953+
parallel_for(0, num_samples * k_dim, [&](long i) {
2954+
const long n = i / k_dim;
2955+
const long k = i % k_dim;
2956+
const long src_nk_offset = (n * src.k() + k) * src_nr;
2957+
const long dest_nk_offset = (n * dest.k() + k) * dest_nr;
2958+
2959+
for (long r = 0; r < src_nr; ++r) {
2960+
for (long c = 0; c < src_nc; ++c) {
2961+
const long src_idx = (src_nk_offset + r) * src_nc + c;
2962+
const long dest_idx = (dest_nk_offset + c) * dest_nc + r;
2963+
2964+
if (add) dest_data[dest_idx] += src_data[src_idx];
2965+
else dest_data[dest_idx] = src_data[src_idx];
2966+
}
2967+
}
2968+
});
2969+
}
2970+
29312971
// ------------------------------------------------------------------------------------
29322972

29332973
}

dlib/cuda/cpu_dlib.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -671,6 +671,14 @@ namespace dlib
671671
size_t count_k
672672
);
673673

674+
// -----------------------------------------------------------------------------------
675+
676+
void transpose(
677+
bool add_to,
678+
tensor& dest,
679+
const tensor& src
680+
);
681+
674682
// -----------------------------------------------------------------------------------
675683

676684
class compute_loss_binary_log_per_pixel

dlib/cuda/cuda_dlib.cu

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2500,6 +2500,46 @@ namespace dlib
25002500

25012501
// ----------------------------------------------------------------------------------------
25022502

2503+
// Transposes each dnr x dnc destination plane by gathering from the source
// tensor: thread i owns destination element i and reads the corresponding
// transposed source element.  All dimension parameters are size_t so the
// index arithmetic stays in one unsigned type (the original `int snc` mixed
// signedness into otherwise size_t expressions).
__global__ void _cuda_transpose(size_t dsize, size_t dk, size_t dnr, size_t dnc, float* d,
                                size_t sk, size_t snr, size_t snc, const float* s, const bool add_to)
{
    const auto plane_size = dnr * dnc;
    const auto sample_size = dk * plane_size;
    for (auto i : grid_stride_range(0, dsize))
    {
        const auto n = i / sample_size;             // sample index
        const auto idx = i % plane_size;            // offset within the dest plane
        const auto in_k = (i / plane_size) % dk;    // k-plane index
        const auto in_r = idx % dnc;                // dest column == source row
        const auto in_c = idx / dnc;                // dest row == source column

        const auto in_idx = ((n * sk + in_k) * snr + in_r) * snc + in_c;
        if (add_to) d[i] += s[in_idx];
        else d[i] = s[in_idx];
    }
}
2521+
2522+
void transpose(
    bool add_to,
    tensor& dest,
    const tensor& src
)
{
    // The kernel gathers, so src and dest must not alias, and dest's matrix
    // dimensions must be the swap of src's.
    DLIB_CASSERT(is_same_object(dest, src) == false);
    DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
                 dest.k() == src.k() &&
                 dest.nr() == src.nc() &&
                 dest.nc() == src.nr(),
                 "Incompatible tensor dimensions.");

    // One logical job per destination element.
    launch_kernel(_cuda_transpose, max_jobs(dest.size()), dest.size(),
                  dest.k(), dest.nr(), dest.nc(), dest.device(),
                  src.k(), src.nr(), src.nc(), src.device(), add_to);
}
2539+
2540+
// ----------------------------------------------------------------------------------------
2541+
2542+
25032543
__device__ float cuda_log1pexp(float x)
25042544
{
25052545
if (x <= -18)

dlib/cuda/cuda_dlib.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,13 @@ namespace dlib
570570
size_t count_k
571571
);
572572

573+
// ----------------------------------------------------------------------------------------
574+
575+
void transpose(
576+
bool add_to,
577+
tensor& dest,
578+
const tensor& src
579+
);
573580

574581
// ----------------------------------------------------------------------------------------
575582

dlib/cuda/tensor_tools.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1279,6 +1279,21 @@ namespace dlib { namespace tt
12791279
#endif
12801280
}
12811281

1282+
// ----------------------------------------------------------------------------------------
1283+
1284+
void transpose(
1285+
bool add_to,
1286+
tensor& dest,
1287+
const tensor& src
1288+
)
1289+
{
1290+
#ifdef DLIB_USE_CUDA
1291+
cuda::transpose(add_to, dest, src);
1292+
#else
1293+
cpu::transpose(add_to, dest, src);
1294+
#endif
1295+
}
1296+
12821297
// ----------------------------------------------------------------------------------------
12831298

12841299
}}

dlib/cuda/tensor_tools.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2186,6 +2186,32 @@ namespace dlib { namespace tt
21862186
i.e., copies content of each sample from src in to corresponding place of sample at dest.
21872187
!*/
21882188

2189+
// ----------------------------------------------------------------------------------------
2190+
2191+
void transpose(
2192+
bool add_to,
2193+
tensor& dest,
2194+
const tensor& src
2195+
);
2196+
/*!
2197+
requires
2198+
- is_same_object(dest, src) == false
2199+
- dest.num_samples() == src.num_samples()
2200+
- dest.k() == src.k()
2201+
- dest.nr() == src.nc()
2202+
- dest.nc() == src.nr()
2203+
ensures
2204+
- Performs a transpose operation on the nr() x nc() matrices within src.
2205+
- If (add_to) is false:
2206+
- The result is stored in dest, overwriting its previous contents.
2207+
- For all valid n, k, r, c:
2208+
- #dest(n,k,c,r) == src(n,k,r,c)
2209+
- If (add_to) is true:
2210+
- The result is added to the existing contents of dest.
2211+
- For all valid n, k, r, c:
2212+
- #dest(n,k,c,r) == dest(n,k,c,r) + src(n,k,r,c)
2213+
!*/
2214+
21892215
// ----------------------------------------------------------------------------------------
21902216

21912217
}}

dlib/dnn/layers.h

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4635,6 +4635,67 @@ namespace dlib
46354635
template <typename SUBNET>
46364636
using reorg = add_layer<reorg_<2, 2>, SUBNET>;
46374637

4638+
// ----------------------------------------------------------------------------------------
4639+
4640+
class transpose_ {
4641+
public:
4642+
transpose_() {}
4643+
template <typename SUBNET> void setup(const SUBNET& /* sub */) {}
4644+
4645+
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output) {
4646+
auto& prev = sub.get_output();
4647+
4648+
output.set_size(prev.num_samples(), prev.k(), prev.nc(), prev.nr());
4649+
tt::transpose(false, output, prev);
4650+
}
4651+
4652+
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) {
4653+
auto& prev = sub.get_gradient_input();
4654+
tt::transpose(true, prev, gradient_input);
4655+
}
4656+
4657+
inline dpoint map_input_to_output(dpoint p) const
4658+
{
4659+
dpoint temp_p;
4660+
temp_p.x() = p.y();
4661+
temp_p.y() = p.x();
4662+
return temp_p;
4663+
}
4664+
inline dpoint map_output_to_input(dpoint p) const
4665+
{
4666+
dpoint temp_p;
4667+
temp_p.x() = p.y();
4668+
temp_p.y() = p.x();
4669+
return temp_p;
4670+
}
4671+
4672+
const tensor& get_layer_params() const { return params; }
4673+
tensor& get_layer_params() { return params; }
4674+
4675+
friend void serialize(const transpose_& /* item */, std::ostream& out) {
4676+
serialize("transpose_", out);
4677+
}
4678+
friend void deserialize(transpose_& /* item */, std::istream& in) {
4679+
std::string version;
4680+
deserialize(version, in);
4681+
if (version != "transpose_")
4682+
throw serialization_error("Unexpected version '" + version + "' found while deserializing dlib::transpose_.");
4683+
}
4684+
4685+
friend std::ostream& operator<<(std::ostream& out, const transpose_& /* item */) {
4686+
out << "transpose";
4687+
return out;
4688+
}
4689+
friend void to_xml(const transpose_& /* item */, std::ostream& out) {
4690+
out << "<transpose />\n";
4691+
}
4692+
4693+
private:
4694+
dlib::resizable_tensor params; // unused
4695+
};
4696+
4697+
template <typename SUBNET> using transpose = add_layer<transpose_, SUBNET>;
4698+
46384699
// ----------------------------------------------------------------------------------------
46394700

46404701
}

dlib/dnn/layers_abstract.h

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3649,6 +3649,60 @@ namespace dlib
36493649
template <typename SUBNET>
36503650
using reorg = add_layer<reorg_<2, 2>, SUBNET>;
36513651

3652+
// ----------------------------------------------------------------------------------------
3653+
3654+
class transpose_
3655+
{
3656+
/*!
3657+
WHAT THIS OBJECT REPRESENTS
3658+
This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
3659+
defined above. In particular, this layer performs a 2D matrix transposition
3660+
on each of the k planes within each sample of a 4D tensor.
3661+
3662+
The dimensions of the tensor output by this layer are as follows (letting
3663+
IN be the input tensor and OUT the output tensor):
3664+
- OUT.num_samples() == IN.num_samples()
3665+
- OUT.k() == IN.k()
3666+
- OUT.nr() == IN.nc()
3667+
- OUT.nc() == IN.nr()
3668+
3669+
The transposition is performed as follows:
3670+
- For each sample i and each k-plane j:
3671+
- OUT[i][j][r][c] = IN[i][j][c][r] for all r in [0, IN.nc()) and c in [0, IN.nr())
3672+
3673+
This layer does not have any learnable parameters.
3674+
!*/
3675+
3676+
public:
3677+
3678+
transpose_() = default;
3679+
3680+
template <typename SUBNET> void setup (const SUBNET& sub);
3681+
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
3682+
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
3683+
3684+
inline dpoint map_input_to_output(dpoint p) const;
3685+
inline dpoint map_output_to_input(dpoint p) const;
3686+
3687+
const tensor& get_layer_params() const;
3688+
tensor& get_layer_params();
3689+
3690+
friend void serialize(const transpose_& item, std::ostream& out);
3691+
friend void deserialize(transpose_& item, std::istream& in);
3692+
3693+
friend std::ostream& operator<<(std::ostream& out, const transpose_& item);
3694+
friend void to_xml(const transpose_& item, std::ostream& out);
3695+
3696+
/*!
3697+
These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
3698+
!*/
3699+
private:
3700+
resizable_tensor params; // unused
3701+
};
3702+
3703+
template <typename SUBNET>
3704+
using transpose = add_layer<transpose_, SUBNET>;
3705+
36523706
// ----------------------------------------------------------------------------------------
36533707

36543708
}

dlib/dnn/visitors.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,14 @@ namespace dlib
10211021
update(i);
10221022
}
10231023

1024+
template <typename U, typename E>
1025+
void operator()(size_t i, const add_layer<transpose_, U, E>&)
1026+
{
1027+
start_node(i, "transpose");
1028+
end_node();
1029+
update(i);
1030+
}
1031+
10241032
template <typename T, typename U, typename E>
10251033
void operator()(size_t i, const add_layer<T, U, E>&)
10261034
{

dlib/test/dnn.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -748,6 +748,37 @@ namespace
748748
#endif
749749
}
750750

751+
// ----------------------------------------------------------------------------------------
752+
753+
void test_transpose()
754+
{
755+
const long num_samples = 2;
756+
const long k = 3;
757+
const long nr = 4;
758+
const long nc = 5;
759+
760+
resizable_tensor input(num_samples, k, nr, nc);
761+
resizable_tensor output_cpu_a(num_samples, k, nc, nr);
762+
tt::tensor_rand rnd(0);
763+
rnd.fill_uniform(input);
764+
resizable_tensor output_cpu_b(input);
765+
766+
cpu::transpose(false, output_cpu_a, input);
767+
cpu::transpose(true, output_cpu_b, output_cpu_a);
768+
input *= 2;
769+
DLIB_TEST(max(abs(mat(output_cpu_b) - mat(input))) < 1e-5);
770+
771+
#ifdef DLIB_USE_CUDA
772+
input /= 2;
773+
resizable_tensor output_cuda_a, output_cuda_b(input);
774+
output_cuda_a.copy_size(output_cpu_a);
775+
cuda::transpose(false, output_cuda_a, input);
776+
cuda::transpose(true, output_cuda_b, output_cuda_a);
777+
DLIB_TEST(max(abs(mat(output_cpu_a) - mat(output_cuda_a))) < 1e-5);
778+
DLIB_TEST(max(abs(mat(output_cpu_b) - mat(output_cuda_b))) < 1e-5);
779+
#endif
780+
}
781+
751782
// ----------------------------------------------------------------------------------------
752783

753784
void test_basic_tensor_ops()
@@ -2280,6 +2311,12 @@ namespace
22802311
auto res = test_layer(l);
22812312
DLIB_TEST_MSG(res, res);
22822313
}
2314+
{
2315+
print_spinner();
2316+
transpose_ l;
2317+
auto res = test_layer(l);
2318+
DLIB_TEST_MSG(res, res);
2319+
}
22832320
}
22842321

22852322
// ----------------------------------------------------------------------------------------
@@ -4489,6 +4526,7 @@ namespace
44894526
test_batch_normalize_conv();
44904527
test_layer_normalize();
44914528
test_rms_normalize();
4529+
test_transpose();
44924530
test_basic_tensor_ops();
44934531
test_layers();
44944532
test_visit_functions();

0 comments on this commit.