ROCm · r-abishek · Sep 4, 2025 · Sep 16, 2025 · Sep 16, 2025 · Sep 16, 2025
diff --git a/docs/data/spectrogramOutput.png b/docs/data/spectrogramOutput.png
diff --git a/utilities/test_suite/HIP/Tensor_audio_hip.cpp b/utilities/test_suite/HIP/Tensor_audio_hip.cpp
@@ -124,12 +124,12 @@ int main(int argc, char **argv)
     }
 
     // compute maximum possible buffer size of resample
-    Rpp64u resampleMaxBufferSize = dstDescPtr->n * dstDescPtr->strides.nStride * 1.15;
+    Rpp64u resampleMaxBufferSize = static_cast<Rpp64u>(oBufferSize * RESAMPLE_BUFFER_SCALE_FACTOR);
     if (testCase == RESAMPLE)
         oBufferSize = resampleMaxBufferSize;
 
     // compute maximum possible buffer size of spectrogram
-    Rpp64u spectrogramMaxBufferSize = 257 * 3754 * dstDescPtr->n;
+    Rpp64u spectrogramMaxBufferSize = SPECTROGRAM_MAX_HEIGHT * SPECTROGRAM_MAX_WIDTH * dstDescPtr->n;
     if (testCase == SPECTROGRAM)
         oBufferSize = spectrogramMaxBufferSize;
 
@@ -300,11 +300,13 @@ int main(int argc, char **argv)
                 {
                     testCaseName = "resample";
 
+                    Rpp32u sampleRate = 16000;
+                    Rpp32f upsampleRatio = 1.15f;
                     maxDstWidth = 0;
                     for(int i = 0, j = 0; i < batchSize; i++, j += 2)
                     {
-                        inRateTensor[i] = 16000;
-                        outRateTensor[i] = 16000 * 1.15f;
+                        inRateTensor[i] = sampleRate;
+                        outRateTensor[i] = sampleRate * upsampleRatio;
                         Rpp32f scaleRatio = outRateTensor[i] / inRateTensor[i];
                         srcDimsTensor[j] = srcLengthTensor[i];
                         srcDimsTensor[j + 1] = channelsTensor[i];
@@ -348,11 +350,11 @@ int main(int argc, char **argv)
                     Rpp32s numFilter = 80;
                     bool normalize = true;
                     srcDimsTensor[0] = 257;
-                    srcDimsTensor[1] = 225;
+                    srcDimsTensor[1] = 3170;
                     srcDimsTensor[2] = 257;
-                    srcDimsTensor[3] = 211;
+                    srcDimsTensor[3] = 552;
                     srcDimsTensor[4] = 257;
-                    srcDimsTensor[5] = 214;
+                    srcDimsTensor[5] = 1131;
 
                     init_mel_filter_bank(&inputf32, &outputf32, srcDescPtr, dstDescPtr, dstDims, offsetInBytes, numFilter, batchSize, srcDimsTensor, scriptPath, testType);
 
@@ -375,7 +377,7 @@ int main(int argc, char **argv)
             if (missingFuncFlag == 1)
             {
                 cout << "\nThe functionality " << func << " doesn't yet exist in RPP\n";
-                return -1;
+                return RPP_ERROR_NOT_IMPLEMENTED;
             }
 
             wallTime = endWallTime - startWallTime;
@@ -395,7 +397,7 @@ int main(int argc, char **argv)
             if (testCase != NON_SILENT_REGION_DETECTION)
                 verify_output(outputf32, dstDescPtr, dstDims, testCaseName, dst, scriptPath, "HIP");
             else
-                verify_non_silent_region_detection(detectedIndex, detectionLength, testCaseName, batchSize, audioNames, dst);
+                verify_non_silent_region_detection(detectedIndex, detectionLength, testCaseName, batchSize, scriptPath, dst);
 
             /* Dump the outputs to csv files for debugging
             Runs only if

diff --git a/utilities/test_suite/HOST/Tensor_audio_host.cpp b/utilities/test_suite/HOST/Tensor_audio_host.cpp
@@ -121,7 +121,7 @@ int main(int argc, char **argv)
         descriptorPtr3D->offsetInBytes = 0;
         descriptorPtr3D->dataType = RpptDataType::F32;
         descriptorPtr3D->dims[0] = batchSize;
-        descriptorPtr3D->dims[1] = maxSrcWidth;
+        descriptorPtr3D->dims[1] = (maxSrcWidth + 7) & ~7; // Round maxSrcWidth up to the next multiple of 8 (presumably to match the padded width used by the typed descriptor - TODO confirm).
         descriptorPtr3D->strides[0] = descriptorPtr3D->dims[1];
     }
 
@@ -138,12 +138,12 @@ int main(int argc, char **argv)
     }
 
     // compute maximum possible buffer size of resample
-    Rpp64u resampleMaxBufferSize = dstDescPtr->n * dstDescPtr->strides.nStride * 1.15;
+    Rpp64u resampleMaxBufferSize = dstDescPtr->n * dstDescPtr->strides.nStride * RESAMPLE_BUFFER_SCALE_FACTOR;
     if (testCase == RESAMPLE)
         oBufferSize = resampleMaxBufferSize;
 
     // compute maximum possible buffer size of spectrogram
-    Rpp64u spectrogramMaxBufferSize = 257 * 3754 * dstDescPtr->n;
+    Rpp64u spectrogramMaxBufferSize = SPECTROGRAM_MAX_HEIGHT * SPECTROGRAM_MAX_WIDTH * dstDescPtr->n;
     if (testCase == SPECTROGRAM)
         oBufferSize = spectrogramMaxBufferSize;
 
@@ -320,11 +320,13 @@ int main(int argc, char **argv)
                     Rpp32f outRateTensor[batchSize];
                     Rpp32s srcDimsTensor[batchSize * 2];
 
+                    Rpp32u sampleRate = 16000;
+                    Rpp32f upsampleRatio = 1.15f;
                     maxDstWidth = 0;
                     for(int i = 0, j = 0; i < batchSize; i++, j += 2)
                     {
-                        inRateTensor[i] = 16000;
-                        outRateTensor[i] = 16000 * 1.15f;
+                        inRateTensor[i] = sampleRate;
+                        outRateTensor[i] = sampleRate * upsampleRatio;
                         Rpp32f scaleRatio = outRateTensor[i] / inRateTensor[i];
                         srcDimsTensor[j] = srcLengthTensor[i];
                         srcDimsTensor[j + 1] = channelsTensor[i];
@@ -364,7 +366,7 @@ int main(int argc, char **argv)
                     Rpp32s numFilter = 80;
                     bool normalize = true;
                     // (height, width) for each tensor in a batch for given QA inputs.
-                    Rpp32s srcDimsTensor[] = {257, 225, 257, 211, 257, 214};
+                    Rpp32s srcDimsTensor[] = {257, 3170, 257, 552, 257, 1131};
 
                     init_mel_filter_bank(&inputf32, &outputf32, srcDescPtr, dstDescPtr, dstDims, offsetInBytes, numFilter, batchSize, srcDimsTensor, scriptPath, testType);
 
@@ -384,7 +386,7 @@ int main(int argc, char **argv)
             if (missingFuncFlag == 1)
             {
                 cout << "\nThe functionality " << func << " doesn't yet exist in RPP\n";
-                return -1;
+                return RPP_ERROR_NOT_IMPLEMENTED;
             }
 
             wallTime = endWallTime - startWallTime;
@@ -397,7 +399,7 @@ int main(int argc, char **argv)
         if (testType == 0)
         {
             if (testCase == NON_SILENT_REGION_DETECTION)
-                verify_non_silent_region_detection(detectedIndex, detectionLength, testCaseName, batchSize, audioNames, dst);
+                verify_non_silent_region_detection(detectedIndex, detectionLength, testCaseName, batchSize, scriptPath, dst);
             else
                 verify_output(outputf32, dstDescPtr, dstDims, testCaseName, dst, scriptPath, "HOST");
 

diff --git a/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/down_mixing/down_mixing.bin b/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/down_mixing/down_mixing.bin
diff --git a/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/mel_filter_bank/mel_filter_bank.bin b/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/mel_filter_bank/mel_filter_bank.bin
diff --git a/...suite/REFERENCE_OUTPUTS_AUDIO/non_silent_region_detection/non_silent_region_detection.bin b/...suite/REFERENCE_OUTPUTS_AUDIO/non_silent_region_detection/non_silent_region_detection.bin
diff --git a/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/pre_emphasis_filter/pre_emphasis_filter.bin b/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/pre_emphasis_filter/pre_emphasis_filter.bin
diff --git a/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/resample/resample.bin b/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/resample/resample.bin
diff --git a/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/slice/slice.bin b/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/slice/slice.bin
diff --git a/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/spectrogram/spectrogram.bin b/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/spectrogram/spectrogram.bin
diff --git a/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/to_decibels/to_decibels.bin b/utilities/test_suite/REFERENCE_OUTPUTS_AUDIO/to_decibels/to_decibels.bin
diff --git a/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample1.wav b/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample1.wav
diff --git a/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample2.wav b/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample2.wav
diff --git a/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample3.wav b/utilities/test_suite/TEST_AUDIO_FILES/three_sample_multi_channel_src1/sample3.wav
diff --git a/utilities/test_suite/TEST_AUDIO_FILES/three_samples_single_channel_src1/sample1.wav b/utilities/test_suite/TEST_AUDIO_FILES/three_samples_single_channel_src1/sample1.wav
diff --git a/utilities/test_suite/TEST_AUDIO_FILES/three_samples_single_channel_src1/sample2.wav b/utilities/test_suite/TEST_AUDIO_FILES/three_samples_single_channel_src1/sample2.wav
diff --git a/utilities/test_suite/TEST_AUDIO_FILES/three_samples_single_channel_src1/sample3.wav b/utilities/test_suite/TEST_AUDIO_FILES/three_samples_single_channel_src1/sample3.wav
diff --git a/utilities/test_suite/rpp_test_suite_audio.h b/utilities/test_suite/rpp_test_suite_audio.h
@@ -35,7 +35,10 @@ SOFTWARE.
 #include <sndfile.h>
 using namespace std;
 
-#define MEL_FILTER_BANK_MAX_HEIGHT 257 // Maximum height for mel filter bank set to 257 to ensure compatibility with test configuration
+#define MEL_FILTER_BANK_MAX_HEIGHT   257  // Maximum height for mel filter bank set to 257 to ensure compatibility with test configuration
+#define RESAMPLE_BUFFER_SCALE_FACTOR 1.15 // Scale factor to allocate a safe maximum buffer size for resampling, allowing for upsampling
+#define SPECTROGRAM_MAX_HEIGHT       257  // Maximum height for spectrogram set to 257 to ensure compatibility with test configuration, calculated as (nfft / 2) + 1 for a standard nfft of 512
+#define SPECTROGRAM_MAX_WIDTH        3170 // Maximum width for a spectrogram, pre-calculated based on the longest audio file in the test dataset
 
 std::map<int, string> audioAugmentationMap =
 {
@@ -60,14 +63,6 @@ enum Augmentation {
     MEL_FILTER_BANK = 7
 };
 
-// Golden outputs for Non Silent Region Detection
-std::map<string, std::vector<int>> NonSilentRegionReferenceOutputs =
-{
-    {"sample1", {0, 35840}},
-    {"sample2", {0, 33680}},
-    {"sample3", {0, 34160}}
-};
-
 // Cutoff values for audio kernels listed for HOST backend followed by HIP
 static const std::map<string, std::vector<double>> audioCutOff =
 {
@@ -251,7 +246,6 @@ void replicate_src_dims_to_fill_batch(Rpp32s *srcDimsTensor, int numSamples, int
 // Compares output with reference outputs and validates QA
 void verify_output(Rpp32f *dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr dstDims, string testCase, string dst, string scriptPath, string backend)
 {
-    fstream refFile;
     int fileMatch = 0;
 
     // read data from golden outputs
@@ -343,24 +337,26 @@ void verify_output(Rpp32f *dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr dst
 }
 
 // Compares output with reference outputs and validates QA for non silent region
-void verify_non_silent_region_detection(int *detectedIndex, int *detectionLength, string testCase, int bs, vector<string> audioNames, string dst)
+void verify_non_silent_region_detection(int *detectedIndex, int *detectionLength, string testCase, int bs, string scriptPath, string dst)
 {
     int fileMatch = 0;
+    // read the golden (begin, length) pairs - two Rpp32s values per sample - from the reference .bin file
+    string outFile = scriptPath + "/../REFERENCE_OUTPUTS_AUDIO/" + testCase + "/" + testCase + ".bin";
+    std::fstream fin(outFile, std::ios::in | std::ios::binary);
+    if(!fin.is_open())
+    {
+        cout << "\nUnable to get the reference outputs for the file specified!" << endl;
+        return;
+    }
+    Rpp32s *refOutput = (Rpp32s *)malloc(bs * 2 * sizeof(Rpp32s));
+    fin.read(reinterpret_cast<char*>(refOutput), bs * 2 * sizeof(Rpp32s));
+
     for (int i = 0; i < bs; i++)
     {
-        string currentFileName = audioNames[i];
-        size_t lastIndex = currentFileName.find_last_of(".");
-        currentFileName = currentFileName.substr(0, lastIndex);  // Remove extension from file name
-        std::vector<int> referenceOutput = NonSilentRegionReferenceOutputs[currentFileName];
-        if(referenceOutput.empty())
-        {
-            cout << "\nUnable to get the reference outputs for the file specified!" << endl;
-            break;
-        }
         Rpp32s outBegin = detectedIndex[i];
         Rpp32s outLength = detectionLength[i];
-        Rpp32s refBegin = referenceOutput[0];
-        Rpp32s refLength = referenceOutput[1];
+        Rpp32s refBegin = refOutput[i * 2];
+        Rpp32s refLength = refOutput[i * 2 + 1];
 
         if ((outBegin == refBegin) && (outLength == refLength))
             fileMatch += 1;
@@ -385,6 +381,8 @@ void verify_non_silent_region_detection(int *detectedIndex, int *detectionLength
         qaResults << status << std::endl;
         qaResults.close();
     }
+
+    free(refOutput);
 }
 
 inline Rpp32f sinc(Rpp32f x)