mongodb · marksg07 · Jan 22, 2025 · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024
@@ -120,6 +120,7 @@ set (MONGOCRYPT_SOURCES
    src/mc-range-encoding.c
    src/mc-rangeopts.c
    src/mc-reader.c
+   src/mc-text-search-str-encode.c
    src/mc-tokens.c
    src/mc-writer.c
    src/mongocrypt-binary.c
@@ -474,6 +475,7 @@ set (TEST_MONGOCRYPT_SOURCES
    test/test-mc-range-mincover.c
    test/test-mc-rangeopts.c
    test/test-mc-reader.c
+   test/test-mc-text-search-str-encode.c
    test/test-mc-tokens.c
    test/test-mc-range-encoding.c
    test/test-mc-writer.c

@@ -119,6 +119,58 @@ bool mc_FLE2RangeInsertSpec_parse(mc_FLE2RangeInsertSpec_t *out,
                                   bool use_range_v2,
                                   mongocrypt_status_t *status);
 
+typedef struct {
+    // mlen is the max string length that can be indexed; a value whose unfolded length exceeds it gets no substring set.
+    uint32_t mlen;
+    // lb is the lower bound (inclusive) on the length of substrings to be indexed.
+    uint32_t lb;
+    // ub is the upper bound (inclusive) on the length of substrings to be indexed.
+    uint32_t ub;
+} mc_FLE2SubstringInsertSpec_t;
+
+typedef struct {
+    // lb is the lower bound (inclusive) on the length of suffixes to be indexed.
+    uint32_t lb;
+    // ub is the upper bound (inclusive) on the length of suffixes to be indexed.
+    uint32_t ub;
+} mc_FLE2SuffixInsertSpec_t;
+
+typedef struct {
+    // lb is the lower bound (inclusive) on the length of prefixes to be indexed.
+    uint32_t lb;
+    // ub is the upper bound (inclusive) on the length of prefixes to be indexed.
+    uint32_t ub;
+} mc_FLE2PrefixInsertSpec_t;
+
+typedef struct {
+    // v is the value to encrypt. It is read as a buffer of `len` chars.
+    const char *v;
+    uint32_t len; // len is the number of chars of v that are used.
+
+    // substr is the spec for substring indexing. `value` is only consulted when `set` is true.
+    struct {
+        mc_FLE2SubstringInsertSpec_t value;
+        bool set;
+    } substr;
+
+    // suffix is the spec for suffix indexing. `value` is only consulted when `set` is true.
+    struct {
+        mc_FLE2SuffixInsertSpec_t value;
+        bool set;
+    } suffix;
+
+    // prefix is the spec for prefix indexing. `value` is only consulted when `set` is true.
+    struct {
+        mc_FLE2PrefixInsertSpec_t value;
+        bool set;
+    } prefix;
+
+    // casef indicates if case folding is enabled. Not yet read by StrEncode (folding is TODO MONGOCRYPT-759).
+    bool casef;
+    // diacf indicates if diacritic folding is enabled. Not yet read by StrEncode (folding is TODO MONGOCRYPT-759).
+    bool diacf;
+} mc_FLE2TextSearchInsertSpec_t;
+
 /** FLE2EncryptionPlaceholder implements Encryption BinData (subtype 6)
  * sub-subtype 0, the intent-to-encrypt mapping. Contains a value to encrypt and
  * a description of how it should be encrypted.

@@ -0,0 +1,60 @@
+/*
+ * Copyright 2024-present MongoDB, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MONGOCRYPT_TEXT_SEARCH_STR_ENCODE_PRIVATE_H
+#define MONGOCRYPT_TEXT_SEARCH_STR_ENCODE_PRIVATE_H
+
+#include "mc-fle2-encryption-placeholder-private.h"
+#include "mongocrypt-status-private.h"
+
+// Set of substrings of a shared base string. Opaque to users of this header; the set does not own the base string.
+typedef struct _mc_substring_set_t mc_substring_set_t;
+
+// Iterator on substring_set. Set it up with mc_substring_set_iter_init before calling mc_substring_set_iter_next.
+typedef struct {
+    mc_substring_set_t *set; // Set being walked; stored as given, the iterator does not own it.
+    uint32_t cur_idx;        // Index of the entry that the next call to mc_substring_set_iter_next reports.
+} mc_substring_set_iter_t;
+
+// Point the iterator to the first substring of the given set. The iterator keeps the `set` pointer as given.
+void mc_substring_set_iter_init(mc_substring_set_iter_t *it, mc_substring_set_t *set);
+
+// Get the next substring, its length, and its count. Returns false if the set does not have a next element, true
+// otherwise. *str points into the set's base string (no NUL terminator). If str is NULL, the iterator only advances.
+bool mc_substring_set_iter_next(mc_substring_set_iter_t *it, const char **str, uint32_t *len, uint32_t *count);
+
+// Result of a StrEncode. Contains the computed prefix, suffix, and substring trees, or NULL if empty, as well as the
+// exact string. Release with mc_str_encode_sets_destroy, which frees base_string and the three sets.
+typedef struct {
+    // Owned. The input string followed by a single 0xFF byte.
+    char *base_string;
+    size_t base_len;                   // Length of base_string, counting the trailing 0xFF byte.
+    mc_substring_set_t *suffix_set;    // NULL if suffix indexing is not requested or yields nothing.
+    mc_substring_set_t *prefix_set;    // NULL if prefix indexing is not requested or yields nothing.
+    mc_substring_set_t *substring_set; // NULL if substring indexing is not requested or yields nothing.
+    char *exact;                       // Points at base_string; it is not a separate allocation.
+    size_t exact_len;                  // Length of the exact string, i.e. without the trailing 0xFF byte.
+} mc_str_encode_sets_t;
+
+// Run StrEncode with the given spec. The returned sets must be released with mc_str_encode_sets_destroy.
+mc_str_encode_sets_t mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpec_t *spec);
+
+// TODO MONGOCRYPT-759 This helper only exists to test folded_len != unfolded_len; make the test actually use folding
+mc_str_encode_sets_t mc_text_search_str_encode_helper(const mc_FLE2TextSearchInsertSpec_t *spec, uint32_t unfolded_len);
+
+void mc_str_encode_sets_destroy(mc_str_encode_sets_t *sets); // Frees what sets owns, not sets itself. NULL is a no-op.
+
+#endif /* MONGOCRYPT_TEXT_SEARCH_STR_ENCODE_PRIVATE_H */
@@ -0,0 +1,250 @@
+/*
+ * Copyright 2024-present MongoDB, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "mc-text-search-str-encode-private.h"
+#include <bson/bson.h>
+
+struct _mc_substring_set_t {
+    // base_string is not owned
+    const char *base_string;
+    uint32_t base_string_len; // Largest end index that mc_substring_set_insert accepts.
+    uint32_t *start_indices;  // Per entry: offset in base_string where the substring starts.
+    uint32_t *end_indices;    // Per entry: offset in base_string one past the substring's last char.
+    // Store counts per substring. As we expect heavy duplication of the padding value, this will save some time when we
+    // hash later.
+    uint32_t *substring_counts;
+    uint32_t n_indices; // Number of entries in each of the three arrays above.
+};
+
+// Create a set with room for n_indices substrings of base_string. The base string is borrowed rather than copied, so
+// it has to outlive the set. All slots start out zero-filled. Release the set with mc_substring_set_destroy.
+mc_substring_set_t *mc_substring_set_new(const char *base_string, uint32_t base_len, uint32_t n_indices) {
+    // Each of the three parallel arrays holds one uint32_t per slot.
+    const size_t array_bytes = sizeof(uint32_t) * n_indices;
+
+    mc_substring_set_t *new_set = (mc_substring_set_t *)bson_malloc0(sizeof(mc_substring_set_t));
+    new_set->start_indices = (uint32_t *)bson_malloc0(array_bytes);
+    new_set->end_indices = (uint32_t *)bson_malloc0(array_bytes);
+    new_set->substring_counts = (uint32_t *)bson_malloc0(array_bytes);
+
+    new_set->n_indices = n_indices;
+    new_set->base_string_len = base_len;
+    new_set->base_string = base_string;
+    return new_set;
+}
+
+// Release a set created by mc_substring_set_new: its three index arrays and the set itself. The base string is left
+// alone because the set never owned it. Passing NULL does nothing.
+void mc_substring_set_destroy(mc_substring_set_t *set) {
+    if (set != NULL) {
+        bson_free(set->start_indices);
+        bson_free(set->end_indices);
+        bson_free(set->substring_counts);
+        bson_free(set);
+    }
+}
+
+// Store the substring [base_start_idx, base_end_idx) of the base string, together with its count, in slot idx.
+// Returns false and leaves the set untouched when the range is reversed, the range ends past the base string length
+// given at creation, idx is not a valid slot, or count is zero. Returns true otherwise.
+bool mc_substring_set_insert(mc_substring_set_t *set,
+                             uint32_t base_start_idx,
+                             uint32_t base_end_idx,
+                             uint32_t idx,
+                             uint32_t count) {
+    const bool range_ok = base_start_idx <= base_end_idx && base_end_idx <= set->base_string_len;
+    const bool slot_ok = idx < set->n_indices;
+    if (!range_ok || !slot_ok || count == 0) {
+        return false;
+    }
+
+    set->substring_counts[idx] = count;
+    set->end_indices[idx] = base_end_idx;
+    set->start_indices[idx] = base_start_idx;
+    return true;
+}
+
+// Reset `it` so that the next call to mc_substring_set_iter_next reports the first entry of `set`.
+void mc_substring_set_iter_init(mc_substring_set_iter_t *it, mc_substring_set_t *set) {
+    mc_substring_set_iter_t fresh;
+    fresh.cur_idx = 0;
+    fresh.set = set;
+    *it = fresh;
+}
+
+// Advance the iterator by one entry and report that entry.
+//
+// Returns false, without touching the iterator or any output, once every entry has been visited; true otherwise.
+//   *str   receives a pointer into the set's base string (a view, not a copy, and not NUL-terminated).
+//   *len   receives the number of chars in the substring.
+//   *count receives the count stored with the substring.
+// If str is NULL the entry is skipped and nothing is written. With a non-NULL str, len and count may each be NULL
+// when the caller does not need that value.
+bool mc_substring_set_iter_next(mc_substring_set_iter_t *it, const char **str, uint32_t *len, uint32_t *count) {
+    const mc_substring_set_t *set = it->set;
+    if (it->cur_idx >= set->n_indices) {
+        return false;
+    }
+    const uint32_t idx = it->cur_idx++;
+    if (str == NULL) {
+        // If out parameters are NULL, just increment cur_idx.
+        return true;
+    }
+    const uint32_t start_idx = set->start_indices[idx];
+    *str = &set->base_string[start_idx];
+    // Guard each remaining output on its own: checking str alone does not make len and count safe to write.
+    if (len != NULL) {
+        *len = set->end_indices[idx] - start_idx;
+    }
+    if (count != NULL) {
+        *count = set->substring_counts[idx];
+    }
+    return true;
+}
+
+// Note -- these are pre-defined only on POSIX systems, so (re)define MIN here. Each argument may be evaluated twice.
+#undef MIN
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+
+#define BAD_CHAR ((char)0xFF) // Byte placed right after the string to form the base string.
+
+// Build the set of prefixes (is_prefix == true) or suffixes (is_prefix == false) of the folded string whose lengths
+// lie in [lb, ub], plus at most one padding entry.
+//
+// base_str is expected to hold folded_len chars followed by the BAD_CHAR byte. The padding entry is the view
+// [0, folded_len + 1) on it, and its count is the number of padding values, so that all counts add up to
+// msize = min(cbclen, ub) - lb + 1, where cbclen is unfolded_len rounded up to a multiple of 16.
+//
+// Returns NULL (empty tree) when ub < lb or when lb exceeds cbclen. The returned set borrows base_str and is released
+// with mc_substring_set_destroy.
+static mc_substring_set_t *generate_prefix_or_suffix_tree(const char *base_str,
+                                                          uint32_t folded_len,
+                                                          uint32_t unfolded_len,
+                                                          uint32_t lb,
+                                                          uint32_t ub,
+                                                          bool is_prefix) {
+    // 16 * ceil(unfolded len / 16). Done in 64 bits so that rounding up cannot wrap around.
+    const uint64_t cbclen = 16 * (((uint64_t)unfolded_len + 15) / 16);
+    if (ub < lb || cbclen < lb) {
+        // No valid substrings, return empty tree. ub < lb is rejected here too: it would underflow msize below.
+        return NULL;
+    }
+
+    // Total number of substrings, padding included.
+    const uint32_t msize = (uint32_t)MIN(cbclen, (uint64_t)ub) - lb + 1;
+    const uint32_t real_max_len = MIN(folded_len, ub);
+    // Number of actual substrings, excluding padding
+    const uint32_t real_substrings = real_max_len >= lb ? real_max_len - lb + 1 : 0;
+    // Padding can only add entries; otherwise the padding count computed below would underflow.
+    BSON_ASSERT(msize >= real_substrings);
+    const bool needs_padding = msize != real_substrings;
+    // One slot per real substring, plus a single shared slot for all of the padding.
+    mc_substring_set_t *set =
+        mc_substring_set_new(base_str, folded_len + 1, needs_padding ? real_substrings + 1 : real_substrings);
+    uint32_t idx = 0;
+    for (uint32_t k = 0; k < real_substrings; k++) {
+        const uint32_t len = lb + k;
+        // Prefixes: [0, lb), [0, lb + 1), ..., [0, min(len, ub))
+        // Suffixes: [len - lb, len), [len - lb - 1, len), ..., [max(0, len - ub), len)
+        const uint32_t start = is_prefix ? 0 : folded_len - len;
+        const uint32_t end = is_prefix ? len : folded_len;
+        const bool inserted = mc_substring_set_insert(set, start, end, idx++, 1);
+        BSON_ASSERT(inserted);
+    }
+    if (needs_padding) {
+        // Insert padding to get to msize
+        const bool inserted = mc_substring_set_insert(set, 0, folded_len + 1, idx++, msize - real_substrings);
+        BSON_ASSERT(inserted);
+    }
+    BSON_ASSERT(idx == set->n_indices);
+    return set;
+}
+
+// Suffix flavor of generate_prefix_or_suffix_tree: the length bounds come from the suffix spec.
+static mc_substring_set_t *generate_suffix_tree(const char *base_str,
+                                                uint32_t folded_len,
+                                                uint32_t unfolded_len,
+                                                const mc_FLE2SuffixInsertSpec_t *spec) {
+    const uint32_t min_len = spec->lb;
+    const uint32_t max_len = spec->ub;
+    const bool want_prefixes = false;
+    return generate_prefix_or_suffix_tree(base_str, folded_len, unfolded_len, min_len, max_len, want_prefixes);
+}
+
+// Prefix flavor of generate_prefix_or_suffix_tree: the length bounds come from the prefix spec.
+static mc_substring_set_t *generate_prefix_tree(const char *base_str,
+                                                uint32_t folded_len,
+                                                uint32_t unfolded_len,
+                                                const mc_FLE2PrefixInsertSpec_t *spec) {
+    const uint32_t min_len = spec->lb;
+    const uint32_t max_len = spec->ub;
+    const bool want_prefixes = true;
+    return generate_prefix_or_suffix_tree(base_str, folded_len, unfolded_len, min_len, max_len, want_prefixes);
+}
+
+// Count the substrings of a string of str_len chars whose length lies in [lb, min(ub, str_len)].
+//
+// There are len - i + 1 substrings of length i in a length len string.
+// Therefore, the total number of substrings with length between lb and ub
+// is the sum of the integers inclusive between A = len - ub + 1 and B = len - lb + 1,
+// A <= B. This has a closed form: (A + B)(B - A + 1)/2.
+//
+// Returns 0 when no length qualifies (lb > str_len or lb > ub). The arithmetic is done in 64 bits; a count that does
+// not fit in 32 bits is reported as UINT32_MAX rather than wrapping around.
+static uint32_t calc_number_of_substrings(uint32_t str_len, uint32_t lb, uint32_t ub) {
+    if (lb > str_len || lb > ub) {
+        return 0;
+    }
+    const uint32_t largest_substr = ub < str_len ? ub : str_len;
+    // A and B from the formula above.
+    const uint64_t largest_substr_count = (uint64_t)str_len - largest_substr + 1;
+    const uint64_t smallest_substr_count = (uint64_t)str_len - lb + 1;
+    uint64_t sum = largest_substr_count + smallest_substr_count;
+    uint64_t terms = smallest_substr_count - largest_substr_count + 1;
+    // sum + terms = 2B + 1 is odd, so exactly one of the two factors is even. Halving that one before multiplying
+    // gives the exact result and keeps the product within 64 bits.
+    if (sum % 2 == 0) {
+        sum /= 2;
+    } else {
+        terms /= 2;
+    }
+    const uint64_t total = sum * terms;
+    return total > UINT32_MAX ? UINT32_MAX : (uint32_t)total;
+}
+
+// Build the set of all substrings of the folded string whose lengths lie in [spec->lb, spec->ub], plus at most one
+// padding entry.
+//
+// base_str is expected to hold folded_len chars followed by the BAD_CHAR byte. The padding entry is the view
+// [0, folded_len + 1) on it, and its count brings the total up to msize, the number of substrings a string of
+// min(spec->mlen, cbclen) chars would have (cbclen is unfolded_len rounded up to a multiple of 16).
+//
+// Returns NULL (empty tree) when unfolded_len exceeds spec->mlen, when spec->lb exceeds cbclen, or when
+// spec->ub < spec->lb. The returned set borrows base_str and is released with mc_substring_set_destroy.
+static mc_substring_set_t *generate_substring_tree(const char *base_str,
+                                                   uint32_t folded_len,
+                                                   uint32_t unfolded_len,
+                                                   const mc_FLE2SubstringInsertSpec_t *spec) {
+    const uint32_t lb = spec->lb;
+    const uint32_t ub = spec->ub;
+    // 16 * ceil(unfolded len / 16). Done in 64 bits so that rounding up cannot wrap around.
+    const uint64_t cbclen = 16 * (((uint64_t)unfolded_len + 15) / 16);
+    if (unfolded_len > spec->mlen || cbclen < lb || ub < lb) {
+        // No valid substrings, return empty tree
+        return NULL;
+    }
+    // If mlen < cbclen, we only need to pad to mlen
+    const uint32_t padded_len = (uint32_t)MIN((uint64_t)spec->mlen, cbclen);
+    // Total number of substrings -- i.e. the number of valid substrings IF the string spanned the full padded length
+    const uint32_t msize = calc_number_of_substrings(padded_len, lb, ub);
+    const uint32_t n_real_substrings = calc_number_of_substrings(folded_len, lb, ub);
+    // Padding can only add entries; otherwise the padding count computed below would underflow.
+    BSON_ASSERT(msize >= n_real_substrings);
+    const bool needs_padding = msize != n_real_substrings;
+    // If n_real_substrings and msize differ, we need to insert padding, so allocate one extra slot.
+    mc_substring_set_t *set =
+        mc_substring_set_new(base_str, folded_len + 1, needs_padding ? n_real_substrings + 1 : n_real_substrings);
+    uint32_t idx = 0;
+    // If folded_len < LB, there are no real substrings, so we can skip (avoiding underflow via folded_len - LB)
+    if (folded_len >= lb) {
+        for (uint32_t start = 0; start < folded_len - lb + 1; start++) {
+            // Longest substring that starts here. Working from what remains of the string (instead of start + ub)
+            // cannot wrap around, whatever ub is.
+            const uint32_t max_len = MIN(folded_len - start, ub);
+            const uint32_t last_end = start + max_len;
+            for (uint32_t end = start + lb; end < last_end + 1; end++) {
+                const bool inserted = mc_substring_set_insert(set, start, end, idx++, 1);
+                BSON_ASSERT(inserted);
+            }
+        }
+    }
+    if (needs_padding) {
+        // Insert padding to get to msize
+        const bool inserted = mc_substring_set_insert(set, 0, folded_len + 1, idx++, msize - n_real_substrings);
+        BSON_ASSERT(inserted);
+    }
+    BSON_ASSERT(idx == set->n_indices);
+    return set;
+}
+
+// Base string = string + 0xFF. All substrings, including padding, can be represented as a view on this.
+//
+// Returns a newly allocated buffer of folded_len + 1 bytes: the first folded_len chars of folded_str followed by
+// BAD_CHAR. There is no NUL terminator. The caller owns the buffer and frees it with bson_free.
+static char *make_base_string_for_str_encode(const char *folded_str, uint32_t folded_len) {
+    // The set builders describe the base string with a uint32_t length of folded_len + 1, so that must not wrap.
+    BSON_ASSERT(folded_len < UINT32_MAX);
+    char *ret = (char *)bson_malloc0((size_t)folded_len + 1u);
+    // Passing a NULL source to memcpy is undefined even for zero bytes, so skip the copy for an empty string.
+    if (folded_len > 0) {
+        memcpy(ret, folded_str, folded_len);
+    }
+    ret[folded_len] = BAD_CHAR;
+    return ret;
+}
+
+// TODO MONGOCRYPT-759 This helper only exists to test folded_len != unfolded_len; make the test actually use folding
+//
+// Runs StrEncode on spec->v, treating it as the already-folded string, with unfolded_len supplied by the caller.
+// A set is only generated for the index kinds enabled in spec; the others stay NULL. Release the result with
+// mc_str_encode_sets_destroy.
+mc_str_encode_sets_t mc_text_search_str_encode_helper(const mc_FLE2TextSearchInsertSpec_t *spec,
+                                                      uint32_t unfolded_len) {
+    const uint32_t folded_len = spec->len;
+
+    mc_str_encode_sets_t out;
+    // Base string is the folded string plus the 0xFF character
+    out.base_string = make_base_string_for_str_encode(spec->v, folded_len);
+    out.base_len = spec->len + 1;
+    // Exact string is always the first len characters of the base string
+    out.exact = out.base_string;
+    out.exact_len = spec->len;
+
+    out.suffix_set = spec->suffix.set
+                       ? generate_suffix_tree(out.base_string, folded_len, unfolded_len, &spec->suffix.value)
+                       : NULL;
+    out.prefix_set = spec->prefix.set
+                       ? generate_prefix_tree(out.base_string, folded_len, unfolded_len, &spec->prefix.value)
+                       : NULL;
+    out.substring_set = spec->substr.set
+                          ? generate_substring_tree(out.base_string, folded_len, unfolded_len, &spec->substr.value)
+                          : NULL;
+    return out;
+}
+
+// Run StrEncode on spec->v. No folding is applied yet, so spec->len serves as the unfolded length as well.
+mc_str_encode_sets_t mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpec_t *spec) {
+    // TODO MONGOCRYPT-759 Implement and use CFold
+    return mc_text_search_str_encode_helper(spec, spec->len);
+}
+
+// Free what a mc_str_encode_sets_t owns: the three substring sets and the base string. `exact` is not freed on its
+// own, and neither is the struct itself. Passing NULL does nothing.
+void mc_str_encode_sets_destroy(mc_str_encode_sets_t *sets) {
+    if (sets != NULL) {
+        mc_substring_set_destroy(sets->substring_set);
+        mc_substring_set_destroy(sets->prefix_set);
+        mc_substring_set_destroy(sets->suffix_set);
+        bson_free(sets->base_string);
+    }
+}