Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ set (MONGOCRYPT_SOURCES
src/mc-range-encoding.c
src/mc-rangeopts.c
src/mc-reader.c
src/mc-text-search-str-encode.c
src/mc-tokens.c
src/mc-writer.c
src/mongocrypt-binary.c
Expand Down Expand Up @@ -474,6 +475,7 @@ set (TEST_MONGOCRYPT_SOURCES
test/test-mc-range-mincover.c
test/test-mc-rangeopts.c
test/test-mc-reader.c
test/test-mc-text-search-str-encode.c
test/test-mc-tokens.c
test/test-mc-range-encoding.c
test/test-mc-writer.c
Expand Down
52 changes: 52 additions & 0 deletions src/mc-fle2-encryption-placeholder-private.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,58 @@ bool mc_FLE2RangeInsertSpec_parse(mc_FLE2RangeInsertSpec_t *out,
bool use_range_v2,
mongocrypt_status_t *status);

// Length bounds used when indexing substrings of a text-search value.
// The visible encoder indexes substrings whose length is between lb and ub, both inclusive, and produces no substring
// set at all when the unfolded input is longer than mlen.
typedef struct {
// mlen is the max string length that can be indexed.
uint32_t mlen;
// lb is the lower bound on the length of substrings to be indexed.
uint32_t lb;
// ub is the upper bound on the length of substrings to be indexed.
uint32_t ub;
} mc_FLE2SubstringInsertSpec_t;

// Length bounds used when indexing suffixes of a text-search value.
// The visible encoder indexes suffixes whose length is between lb and ub, both inclusive.
typedef struct {
// lb is the lower bound on the length of suffixes to be indexed.
uint32_t lb;
// ub is the upper bound on the length of suffixes to be indexed.
uint32_t ub;
} mc_FLE2SuffixInsertSpec_t;

// Length bounds used when indexing prefixes of a text-search value.
// The visible encoder indexes prefixes whose length is between lb and ub, both inclusive.
typedef struct {
// lb is the lower bound on the length of prefixes to be indexed.
uint32_t lb;
// ub is the upper bound on the length of prefixes to be indexed.
uint32_t ub;
} mc_FLE2PrefixInsertSpec_t;

// Describes one text-search insert: the value plus which of the substring / suffix / prefix indexes are requested.
// Each optional sub-spec is a (value, set) pair; value is only meaningful when set is true.
typedef struct {
// v is the value to encrypt.
const char *v;
// len is the length of v in bytes. The visible encoder copies exactly len bytes from v and does not look for a
// terminator.
uint32_t len;

// substr is the spec for substring indexing.
struct {
mc_FLE2SubstringInsertSpec_t value;
bool set;
} substr;

// suffix is the spec for suffix indexing.
struct {
mc_FLE2SuffixInsertSpec_t value;
bool set;
} suffix;

// prefix is the spec for prefix indexing.
struct {
mc_FLE2PrefixInsertSpec_t value;
bool set;
} prefix;

// casef indicates if case folding is enabled.
bool casef;
// diacf indicates if diacritic folding is enabled.
bool diacf;
} mc_FLE2TextSearchInsertSpec_t;

/** FLE2EncryptionPlaceholder implements Encryption BinData (subtype 6)
* sub-subtype 0, the intent-to-encrypt mapping. Contains a value to encrypt and
* a description of how it should be encrypted.
Expand Down
60 changes: 60 additions & 0 deletions src/mc-text-search-str-encode-private.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* Copyright 2024-present MongoDB, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MONGOCRYPT_TEXT_SEARCH_STR_ENCODE_PRIVATE_H
#define MONGOCRYPT_TEXT_SEARCH_STR_ENCODE_PRIVATE_H

#include "mc-fle2-encryption-placeholder-private.h"
#include "mongocrypt-status-private.h"

// Set of substrings of a shared base string.
typedef struct _mc_substring_set_t mc_substring_set_t;

// Iterator on substring_set.
// Initialize with mc_substring_set_iter_init and advance with mc_substring_set_iter_next.
typedef struct {
// The set being iterated. Stored as given to mc_substring_set_iter_init.
mc_substring_set_t *set;
// Index of the entry that the next call to mc_substring_set_iter_next will return.
uint32_t cur_idx;
} mc_substring_set_iter_t;

// Point the iterator to the first substring of the given set.
// The iterator keeps the set pointer as given, so the set must stay valid for as long as the iterator is used.
void mc_substring_set_iter_init(mc_substring_set_iter_t *it, mc_substring_set_t *set);

// Get the next substring, its length, and its count. Returns false if the set does not have a next element, true
// otherwise.
// On success *str is a pointer into the set's base string (not a copy); read exactly *len bytes from it.
// Passing NULL for all three out parameters only advances the iterator.
bool mc_substring_set_iter_next(mc_substring_set_iter_t *it, const char **str, uint32_t *len, uint32_t *count);

// Result of a StrEncode. Contains the computed prefix, suffix, and substring trees, or NULL if empty, as well as the
// exact string.
// Release with mc_str_encode_sets_destroy.
typedef struct {
// Owned. Holds the input bytes followed by one extra trailing byte.
char *base_string;
// Length of base_string in bytes, including the extra trailing byte.
size_t base_len;
// Each set is NULL when the matching spec was not set or when there are no valid substrings for it. Non-NULL sets
// are freed by mc_str_encode_sets_destroy and reference base_string rather than copying it.
mc_substring_set_t *suffix_set;
mc_substring_set_t *prefix_set;
mc_substring_set_t *substring_set;
// Same pointer as base_string; it is not a separate allocation and must not be freed on its own.
char *exact;
// Length of the exact string in bytes: the input length, without the extra trailing byte.
size_t exact_len;
} mc_str_encode_sets_t;

// Run StrEncode with the given spec.
// The result owns heap memory; release it with mc_str_encode_sets_destroy.
mc_str_encode_sets_t mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpec_t *spec);

// TODO MONGOCRYPT-759 This helper only exists to test folded_len != unfolded_len; make the test actually use folding
// Same as mc_text_search_str_encode, but spec->v / spec->len are used as the folded string and unfolded_len is
// supplied by the caller.
mc_str_encode_sets_t mc_text_search_str_encode_helper(const mc_FLE2TextSearchInsertSpec_t *spec, uint32_t unfolded_len);

// Free the base string and the suffix, prefix, and substring sets held by sets. The mc_str_encode_sets_t object itself
// is not freed. Passing NULL is a no-op.
void mc_str_encode_sets_destroy(mc_str_encode_sets_t *sets);

#endif /* MONGOCRYPT_TEXT_SEARCH_STR_ENCODE_PRIVATE_H */
250 changes: 250 additions & 0 deletions src/mc-text-search-str-encode.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
/*
* Copyright 2024-present MongoDB, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "mc-text-search-str-encode-private.h"
#include <bson/bson.h>

// A set of substrings of one base string. Entry k is the half-open byte range
// [start_indices[k], end_indices[k]) of base_string, repeated substring_counts[k] times.
struct _mc_substring_set_t {
// base_string is not owned
const char *base_string;
// Length of base_string in bytes; mc_substring_set_insert rejects ranges that end past it.
uint32_t base_string_len;
// Per-entry start offset (inclusive) into base_string.
uint32_t *start_indices;
// Per-entry end offset (exclusive) into base_string.
uint32_t *end_indices;
// Store counts per substring. As we expect heavy duplication of the padding value, this will save some time when we
// hash later.
uint32_t *substring_counts;
// Number of slots in each of the three arrays above.
uint32_t n_indices;
};

// Allocate a substring set over base_string (borrowed, not copied) with room for n_indices entries.
// All entry slots start zeroed; fill them with mc_substring_set_insert. Free with mc_substring_set_destroy.
mc_substring_set_t *mc_substring_set_new(const char *base_string, uint32_t base_len, uint32_t n_indices) {
    // Every per-entry array has the same element type and length.
    const size_t array_bytes = sizeof(uint32_t) * n_indices;

    mc_substring_set_t *result = (mc_substring_set_t *)bson_malloc0(sizeof *result);
    result->start_indices = (uint32_t *)bson_malloc0(array_bytes);
    result->end_indices = (uint32_t *)bson_malloc0(array_bytes);
    result->substring_counts = (uint32_t *)bson_malloc0(array_bytes);

    result->n_indices = n_indices;
    result->base_string_len = base_len;
    result->base_string = base_string;
    return result;
}

// Release a set created by mc_substring_set_new, including its three index arrays.
// The base string is borrowed and is left untouched. NULL is accepted and ignored.
void mc_substring_set_destroy(mc_substring_set_t *set) {
    if (set != NULL) {
        bson_free(set->start_indices);
        bson_free(set->end_indices);
        bson_free(set->substring_counts);
        bson_free(set);
    }
}

// Record entry idx of the set as the range [base_start_idx, base_end_idx) of the base string, repeated count times.
// Returns false, leaving the set unchanged, when the range is reversed or ends past the base string, when idx is
// not a valid slot, or when count is zero. Returns true otherwise.
bool mc_substring_set_insert(mc_substring_set_t *set,
                             uint32_t base_start_idx,
                             uint32_t base_end_idx,
                             uint32_t idx,
                             uint32_t count) {
    const bool range_ok = base_start_idx <= base_end_idx && base_end_idx <= set->base_string_len;
    const bool slot_ok = idx < set->n_indices;
    if (!range_ok || !slot_ok || count == 0) {
        return false;
    }

    set->substring_counts[idx] = count;
    set->end_indices[idx] = base_end_idx;
    set->start_indices[idx] = base_start_idx;
    return true;
}

// Reset the iterator so that the next call to mc_substring_set_iter_next yields entry 0 of set.
void mc_substring_set_iter_init(mc_substring_set_iter_t *it, mc_substring_set_t *set) {
    *it = (mc_substring_set_iter_t){.set = set, .cur_idx = 0};
}

// Advance the iterator and report the entry it was pointing at.
//
// str, len, count: optional out parameters. Each one that is non-NULL receives, respectively, a pointer into the
//   set's base string (not a copy), the substring length in bytes, and the substring's repeat count. Any of them may
//   be NULL independently; passing NULL for all three just advances the iterator.
//
// Returns false when there is no next element (including when the iterator was initialized with a NULL set, which is
// how an empty set is represented), true otherwise.
bool mc_substring_set_iter_next(mc_substring_set_iter_t *it, const char **str, uint32_t *len, uint32_t *count) {
    const mc_substring_set_t *set = it->set;
    if (set == NULL || it->cur_idx >= set->n_indices) {
        return false;
    }

    const uint32_t idx = it->cur_idx++;
    const uint32_t start_idx = set->start_indices[idx];
    const uint32_t end_idx = set->end_indices[idx];

    // Guard every out parameter on its own: previously only str was checked while len and count were written
    // unconditionally.
    if (str != NULL) {
        *str = &set->base_string[start_idx];
    }
    if (len != NULL) {
        *len = end_idx - start_idx;
    }
    if (count != NULL) {
        *count = set->substring_counts[idx];
    }
    return true;
}

// Note -- these are pre-defined only on POSIX systems.
#undef MIN
// Caution: an argument may be evaluated twice, so do not pass expressions with side effects.
#define MIN(a, b) (((a) < (b)) ? (a) : (b))

// Byte 0xFF as a char. It is written right after the string contents when the base string is built, and the padding
// entry of each set is a range that includes it.
#define BAD_CHAR ((char)0xFF)

// Build the set of prefixes (is_prefix == true) or suffixes (is_prefix == false) of the folded string whose length is
// in [lb, ub], plus one padding entry when fewer real substrings exist than the padded length would allow.
//
// base_str:     the base string (folded string followed by one extra byte); borrowed by the returned set.
// folded_len:   number of bytes of the folded string inside base_str (excluding the extra byte).
// unfolded_len: length of the original string; it determines the padded length and hence the total entry count.
//
// Returns NULL when no substring length is valid, otherwise a set the caller frees with mc_substring_set_destroy.
static mc_substring_set_t *generate_prefix_or_suffix_tree(const char *base_str,
                                                          uint32_t folded_len,
                                                          uint32_t unfolded_len,
                                                          uint32_t lb,
                                                          uint32_t ub,
                                                          bool is_prefix) {
    // 16 * ceil(unfolded len / 16)
    uint32_t cbclen = 16 * (uint32_t)((unfolded_len + 15) / 16);
    if (cbclen < lb || ub < lb) {
        // No valid substrings, return empty tree. (ub < lb would otherwise wrap the unsigned msize below.)
        return NULL;
    }

    // Total number of substrings
    uint32_t msize = MIN(cbclen, ub) - lb + 1;
    uint32_t real_max_len = MIN(folded_len, ub);
    // Number of actual substrings, excluding padding
    uint32_t real_substrings = real_max_len >= lb ? real_max_len - lb + 1 : 0;
    // The padding count is msize - real_substrings in unsigned arithmetic; refuse to let it wrap.
    BSON_ASSERT(msize >= real_substrings);
    const bool needs_padding = msize != real_substrings;

    // If real_substrings and msize differ, we need to insert padding, so allocate one extra slot.
    mc_substring_set_t *set =
        mc_substring_set_new(base_str, folded_len + 1, needs_padding ? real_substrings + 1 : real_substrings);

    uint32_t idx = 0;
    for (uint32_t i = lb; i < real_max_len + 1; i++) {
        bool inserted;
        if (is_prefix) {
            // [0, lb), [0, lb + 1), ..., [0, min(len, ub))
            inserted = mc_substring_set_insert(set, 0, i, idx, 1);
        } else {
            // [len - lb, len), [len - lb - 1, len), ..., [max(0, len - ub), len)
            inserted = mc_substring_set_insert(set, folded_len - i, folded_len, idx, 1);
        }
        BSON_ASSERT(inserted);
        idx++;
    }
    if (needs_padding) {
        // Insert padding to get to msize. The padding entry spans the whole base string, extra byte included.
        // Checked like every other insert so a rejected entry cannot go unnoticed.
        const bool inserted = mc_substring_set_insert(set, 0, folded_len + 1, idx, msize - real_substrings);
        BSON_ASSERT(inserted);
        idx++;
    }
    BSON_ASSERT(idx == set->n_indices);
    return set;
}

// Suffix flavour of generate_prefix_or_suffix_tree: forwards the bounds from the suffix spec.
static mc_substring_set_t *generate_suffix_tree(const char *base_str,
                                                uint32_t folded_len,
                                                uint32_t unfolded_len,
                                                const mc_FLE2SuffixInsertSpec_t *spec) {
    const uint32_t lower = spec->lb;
    const uint32_t upper = spec->ub;
    return generate_prefix_or_suffix_tree(base_str, folded_len, unfolded_len, lower, upper, /* is_prefix */ false);
}

// Prefix flavour of generate_prefix_or_suffix_tree: forwards the bounds from the prefix spec.
static mc_substring_set_t *generate_prefix_tree(const char *base_str,
                                                uint32_t folded_len,
                                                uint32_t unfolded_len,
                                                const mc_FLE2PrefixInsertSpec_t *spec) {
    const uint32_t lower = spec->lb;
    const uint32_t upper = spec->ub;
    return generate_prefix_or_suffix_tree(base_str, folded_len, unfolded_len, lower, upper, /* is_prefix */ true);
}

// Count the substrings of a string of length len whose own length is between lb and ub (both inclusive).
//
// There are len - i + 1 substrings of length i in a length len string.
// Therefore, the total number of substrings with length between lb and ub
// is the sum of the integers inclusive between A = len - ub + 1 and B = len - lb + 1,
// A <= B. This has a closed form: (A + B)(B - A + 1)/2.
//
// Returns 0 when no length qualifies (lb > len or lb > ub). The arithmetic is done in 64 bits so the intermediate
// product cannot wrap; a total that does not fit in uint32_t is reported as UINT32_MAX rather than a wrapped value.
static uint32_t calc_number_of_substrings(uint32_t len, uint32_t lb, uint32_t ub) {
    if (lb > len || lb > ub) {
        return 0;
    }
    // Longest substring length actually available.
    const uint64_t longest = len < ub ? len : ub;
    // A: number of substrings of the longest length. B: number of substrings of the shortest length.
    const uint64_t fewest = (uint64_t)len - longest + 1;
    const uint64_t most = (uint64_t)len - lb + 1;

    uint64_t sum = fewest + most;
    uint64_t terms = most - fewest + 1;
    // sum + terms == 2 * most + 1 is odd, so exactly one factor is even. Halve that one before multiplying so the
    // product stays below 2^64 for every 32-bit input.
    if (sum % 2 == 0) {
        sum /= 2;
    } else {
        terms /= 2;
    }
    const uint64_t total = sum * terms;
    return total > UINT32_MAX ? UINT32_MAX : (uint32_t)total;
}

// Build the set of all substrings of the folded string whose length is in [spec->lb, spec->ub], plus one padding
// entry when fewer real substrings exist than a string of the padded length would have.
//
// base_str:     the base string (folded string followed by one extra byte); borrowed by the returned set.
// folded_len:   number of bytes of the folded string inside base_str (excluding the extra byte).
// unfolded_len: length of the original string; it determines the padded length and hence the total entry count.
//
// Returns NULL when no substring is valid (input longer than mlen, padded length below lb, or ub < lb), otherwise a
// set the caller frees with mc_substring_set_destroy.
static mc_substring_set_t *generate_substring_tree(const char *base_str,
                                                   uint32_t folded_len,
                                                   uint32_t unfolded_len,
                                                   const mc_FLE2SubstringInsertSpec_t *spec) {
    // 16 * ceil(unfolded len / 16)
    uint32_t cbclen = 16 * (uint32_t)((unfolded_len + 15) / 16);
    if (unfolded_len > spec->mlen || cbclen < spec->lb || spec->ub < spec->lb) {
        // No valid substrings, return empty tree
        return NULL;
    }
    // If mlen < cbclen, we only need to pad to mlen
    uint32_t padded_len = MIN(spec->mlen, cbclen);
    // Total number of substrings -- i.e. the number of valid substrings IF the string spanned the full padded length
    uint32_t msize = calc_number_of_substrings(padded_len, spec->lb, spec->ub);
    uint32_t n_real_substrings = calc_number_of_substrings(folded_len, spec->lb, spec->ub);
    // The padding count is msize - n_real_substrings in unsigned arithmetic; refuse to let it wrap.
    BSON_ASSERT(msize >= n_real_substrings);
    const bool needs_padding = msize != n_real_substrings;

    // If real_substrings and msize differ, we need to insert padding, so allocate one extra slot.
    mc_substring_set_t *set =
        mc_substring_set_new(base_str, folded_len + 1, needs_padding ? n_real_substrings + 1 : n_real_substrings);

    uint32_t idx = 0;
    // If folded_len < LB, there are no real substrings, so we can skip (avoiding underflow via folded_len - LB)
    if (folded_len >= spec->lb) {
        for (uint32_t i = 0; i < folded_len - spec->lb + 1; i++) {
            // Substrings starting at i end no later than the end of the folded string and are at most ub long.
            const uint32_t last_end = MIN(folded_len, i + spec->ub);
            for (uint32_t j = i + spec->lb; j < last_end + 1; j++) {
                // Check the result, as the prefix/suffix builder does, so a rejected entry cannot go unnoticed.
                const bool inserted = mc_substring_set_insert(set, i, j, idx, 1);
                BSON_ASSERT(inserted);
                idx++;
            }
        }
    }
    if (needs_padding) {
        // The padding entry spans the whole base string, extra byte included.
        const bool inserted = mc_substring_set_insert(set, 0, folded_len + 1, idx, msize - n_real_substrings);
        BSON_ASSERT(inserted);
        idx++;
    }
    BSON_ASSERT(idx == set->n_indices);
    return set;
}

// Base string = string + 0xFF. All substrings, including padding, can be represented as a view on this.
static char *make_base_string_for_str_encode(const char *folded_str, uint32_t folded_len) {
    // Returns a newly allocated buffer of folded_len + 1 bytes: a copy of folded_str followed by BAD_CHAR.
    // The buffer is not NUL-terminated. The caller owns it and releases it with bson_free.

    // Widen before adding 1 so that a 32-bit wrap cannot shrink the allocation below what is written.
    char *ret = (char *)bson_malloc0((size_t)folded_len + 1u);
    if (folded_len > 0) {
        // memcpy must not be given a NULL source, even for a zero length, so only copy when there is data.
        BSON_ASSERT(folded_str != NULL);
        memcpy(ret, folded_str, folded_len);
    }
    ret[folded_len] = BAD_CHAR;
    return ret;
}

// TODO MONGOCRYPT-759 This helper only exists to test folded_len != unfolded_len; make the test actually use folding
mc_str_encode_sets_t mc_text_search_str_encode_helper(const mc_FLE2TextSearchInsertSpec_t *spec,
uint32_t unfolded_len) {
const char *folded_str = spec->v;
uint32_t folded_len = spec->len;

mc_str_encode_sets_t sets;
sets.suffix_set = NULL;
sets.prefix_set = NULL;
sets.substring_set = NULL;
// Base string is the folded string plus the 0xFF character
sets.base_string = make_base_string_for_str_encode(folded_str, folded_len);
sets.base_len = spec->len + 1;
if (spec->suffix.set) {
sets.suffix_set = generate_suffix_tree(sets.base_string, folded_len, unfolded_len, &spec->suffix.value);
}
if (spec->prefix.set) {
sets.prefix_set = generate_prefix_tree(sets.base_string, folded_len, unfolded_len, &spec->prefix.value);
}
if (spec->substr.set) {
sets.substring_set = generate_substring_tree(sets.base_string, folded_len, unfolded_len, &spec->substr.value);
}
// Exact string is always the first len characters of the base string
sets.exact = sets.base_string;
sets.exact_len = spec->len;
return sets;
}

mc_str_encode_sets_t mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpec_t *spec) {
    // TODO MONGOCRYPT-759 Implement and use CFold
    // No folding is applied yet, so the input length is passed as the unfolded length.
    return mc_text_search_str_encode_helper(spec, spec->len);
}

// Free everything owned by sets: the base string and the suffix, prefix, and substring sets.
// The mc_str_encode_sets_t object itself is not freed. Passing NULL is a no-op.
// All members are reset afterwards so that a repeated call is harmless and no stale pointer can be followed.
void mc_str_encode_sets_destroy(mc_str_encode_sets_t *sets) {
    if (sets == NULL) {
        return;
    }
    bson_free(sets->base_string);
    mc_substring_set_destroy(sets->suffix_set);
    mc_substring_set_destroy(sets->prefix_set);
    mc_substring_set_destroy(sets->substring_set);

    sets->base_string = NULL;
    sets->base_len = 0;
    sets->suffix_set = NULL;
    sets->prefix_set = NULL;
    sets->substring_set = NULL;
    // exact is the same pointer as base_string, so it is dangling once base_string is freed.
    sets->exact = NULL;
    sets->exact_len = 0;
}
Loading
Loading