-
Notifications
You must be signed in to change notification settings - Fork 99
MONGOCRYPT-755 Implement StrEncode #928
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 11 commits
Commits
Show all changes
25 commits
Select commit
Hold shift + click to select a range
70e2ef4
MONGOCRYPT-755 Implement StrEncode
marksg07 fe6f93b
Comments + cleanup
marksg07 c8678c8
more comments
marksg07 5215b80
fix
marksg07 e5e8c58
fix ff
marksg07 92bfeb0
fix
marksg07 ceacd48
f
marksg07 cbd420d
windows
marksg07 54f6815
ll
marksg07 481f378
lld
marksg07 85a12ba
Merge branch 'master' into marksg07/mongocrypt-755
marksg07 723427d
unicode
marksg07 0286858
comment
marksg07 b0c023f
comments
marksg07 cb6bcf2
const
marksg07 10792c2
windows
marksg07 4bcba8a
Hashset
marksg07 3e0301e
PR fixes
marksg07 dad5688
fix bug
marksg07 48f80c1
a
marksg07 59e5944
more leaks
marksg07 67b5d07
Merge branch 'master' into marksg07/mongocrypt-755
marksg07 d8f11cb
Fixes
marksg07 95786df
Merge branch 'master' into marksg07/mongocrypt-755
marksg07 b75e949
pr
marksg07 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
/* | ||
* Copyright 2024-present MongoDB, Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#ifndef MONGOCRYPT_TEXT_SEARCH_STR_ENCODE_PRIVATE_H | ||
#define MONGOCRYPT_TEXT_SEARCH_STR_ENCODE_PRIVATE_H | ||
|
||
#include "mc-fle2-encryption-placeholder-private.h" | ||
#include "mongocrypt-status-private.h" | ||
|
||
// Set of substrings of a shared base string. | ||
typedef struct _mc_substring_set_t mc_substring_set_t; | ||
|
||
// Iterator on substring_set. | ||
typedef struct { | ||
mc_substring_set_t *set; | ||
uint32_t cur_idx; | ||
} mc_substring_set_iter_t; | ||
|
||
// Point the iterator to the first substring of the given set. | ||
void mc_substring_set_iter_init(mc_substring_set_iter_t *it, mc_substring_set_t *set); | ||
|
||
// Get the next substring, its length, and its count. Returns false if the set does not have a next element, true | ||
// otherwise. | ||
bool mc_substring_set_iter_next(mc_substring_set_iter_t *it, const char **str, uint32_t *len, uint32_t *count); | ||
|
||
// Result of a StrEncode. Contains the computed prefix, suffix, and substring trees, or NULL if empty, as well as the | ||
// exact string. | ||
typedef struct { | ||
// Owned | ||
char *base_string; | ||
size_t base_len; | ||
mc_substring_set_t *suffix_set; | ||
mc_substring_set_t *prefix_set; | ||
mc_substring_set_t *substring_set; | ||
char *exact; | ||
size_t exact_len; | ||
erwee marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
} mc_str_encode_sets_t; | ||
erwee marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
// Run StrEncode with the given spec. | ||
mc_str_encode_sets_t mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpec_t *spec); | ||
|
||
// TODO MONGOCRYPT-759 This helper only exists to test folded_len != unfolded_len; make the test actually use folding | ||
mc_str_encode_sets_t mc_text_search_str_encode_helper(const mc_FLE2TextSearchInsertSpec_t *spec, uint32_t unfolded_len); | ||
|
||
void mc_str_encode_sets_destroy(mc_str_encode_sets_t *sets); | ||
|
||
#endif /* MONGOCRYPT_TEXT_SEARCH_STR_ENCODE_PRIVATE_H */ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,250 @@ | ||
/* | ||
* Copyright 2024-present MongoDB, Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include "mc-text-search-str-encode-private.h" | ||
#include <bson/bson.h> | ||
|
||
// A collection of substrings, each stored as a [start, end) index pair into one shared base string together with a
// repeat count. The three arrays are parallel: slot i of each describes the same substring.
struct _mc_substring_set_t {
    // Shared backing string. Borrowed: the set never frees it.
    const char *base_string;
    // Length of base_string in bytes; upper bound for any end index.
    uint32_t base_string_len;
    // Inclusive start offset of each substring.
    uint32_t *start_indices;
    // Exclusive end offset of each substring.
    uint32_t *end_indices;
    // How many times each substring occurs. The padding value is expected to be heavily duplicated, so storing a
    // count instead of repeated entries saves work when the substrings are hashed later.
    uint32_t *substring_counts;
    // Number of slots in each of the three arrays above.
    uint32_t n_indices;
};
|
||
mc_substring_set_t *mc_substring_set_new(const char *base_string, uint32_t base_len, uint32_t n_indices) { | ||
mc_substring_set_t *set = (mc_substring_set_t *)bson_malloc0(sizeof(mc_substring_set_t)); | ||
set->base_string = base_string; | ||
set->base_string_len = base_len; | ||
set->start_indices = (uint32_t *)bson_malloc0(sizeof(uint32_t) * n_indices); | ||
set->end_indices = (uint32_t *)bson_malloc0(sizeof(uint32_t) * n_indices); | ||
set->substring_counts = (uint32_t *)bson_malloc0(sizeof(uint32_t) * n_indices); | ||
set->n_indices = n_indices; | ||
return set; | ||
} | ||
|
||
// Free a set and its index/count arrays. The base string is borrowed and is left alone. NULL is accepted and ignored.
void mc_substring_set_destroy(mc_substring_set_t *set) {
    if (set != NULL) {
        bson_free(set->start_indices);
        bson_free(set->end_indices);
        bson_free(set->substring_counts);
        bson_free(set);
    }
}
|
||
// Record the substring [base_start_idx, base_end_idx) of the base string in slot idx, occurring count times.
// Returns false without modifying the set if the range is inverted, the range runs past the end of the base string,
// idx is not a valid slot, or count is zero. Returns true on success.
bool mc_substring_set_insert(mc_substring_set_t *set,
                             uint32_t base_start_idx,
                             uint32_t base_end_idx,
                             uint32_t idx,
                             uint32_t count) {
    const bool range_ok = base_start_idx <= base_end_idx && base_end_idx <= set->base_string_len;
    const bool slot_ok = idx < set->n_indices;
    if (!range_ok || !slot_ok || count == 0) {
        return false;
    }

    set->substring_counts[idx] = count;
    set->start_indices[idx] = base_start_idx;
    set->end_indices[idx] = base_end_idx;
    return true;
}
|
||
// Bind the iterator to set and rewind it so the next call to mc_substring_set_iter_next yields the first entry.
void mc_substring_set_iter_init(mc_substring_set_iter_t *it, mc_substring_set_t *set) {
    it->cur_idx = 0;
    it->set = set;
}
|
||
// Advance the iterator by one entry.
//
// Returns false, leaving the iterator and all out-parameters untouched, once every slot of the set has been visited.
// Otherwise consumes one slot and returns true.
//
// Out-parameters:
//   str   - receives a pointer to the start of the substring inside the set's base string. This is a view into the
//           shared buffer, so use *len for its extent. If str is NULL the call only advances the iterator and writes
//           nothing to len or count either.
//   len   - receives the substring length in bytes. May be NULL.
//   count - receives the number of occurrences recorded for the substring. May be NULL.
bool mc_substring_set_iter_next(mc_substring_set_iter_t *it, const char **str, uint32_t *len, uint32_t *count) {
    if (it->cur_idx >= it->set->n_indices) {
        return false;
    }
    uint32_t idx = it->cur_idx++;
    if (str == NULL) {
        // Caller only wants to skip this entry.
        return true;
    }
    uint32_t start_idx = it->set->start_indices[idx];
    uint32_t end_idx = it->set->end_indices[idx];
    *str = &it->set->base_string[start_idx];
    // Previously len and count were dereferenced unconditionally, crashing callers that only wanted the string.
    if (len != NULL) {
        *len = end_idx - start_idx;
    }
    if (count != NULL) {
        *count = it->set->substring_counts[idx];
    }
    return true;
}
|
||
// Note -- these are pre-defined only on POSIX systems. | ||
#undef MIN | ||
#define MIN(a, b) (((a) < (b)) ? (a) : (b)) | ||
erwee marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
||
#define BAD_CHAR ((char)0xFF) | ||
|
||
static mc_substring_set_t *generate_prefix_or_suffix_tree(const char *base_str, | ||
uint32_t folded_len, | ||
uint32_t unfolded_len, | ||
uint32_t lb, | ||
uint32_t ub, | ||
bool is_prefix) { | ||
// 16 * ceil(unfolded len / 16) | ||
uint32_t cbclen = 16 * (uint32_t)((unfolded_len + 15) / 16); | ||
if (cbclen < lb) { | ||
// No valid substrings, return empty tree | ||
return NULL; | ||
} | ||
|
||
// Total number of substrings | ||
uint32_t msize = MIN(cbclen, ub) - lb + 1; | ||
uint32_t real_max_len = MIN(folded_len, ub); | ||
// Number of actual substrings, excluding padding | ||
uint32_t real_substrings = real_max_len >= lb ? real_max_len - lb + 1 : 0; | ||
// If real_substrings and msize differ, we need to insert padding, so allocate one extra slot. | ||
mc_substring_set_t *set = mc_substring_set_new(base_str, | ||
folded_len + 1, | ||
real_substrings == msize ? real_substrings : real_substrings + 1); | ||
uint32_t idx = 0; | ||
for (uint32_t i = lb; i < real_max_len + 1; i++) { | ||
if (is_prefix) { | ||
// [0, lb), [0, lb + 1), ..., [0, min(len, ub)) | ||
BSON_ASSERT(mc_substring_set_insert(set, 0, i, idx++, 1)); | ||
} else { | ||
// [len - lb, len), [len - lb - 1, len), ..., [max(0, len - ub), len) | ||
BSON_ASSERT(mc_substring_set_insert(set, folded_len - i, folded_len, idx++, 1)); | ||
} | ||
} | ||
if (msize != real_substrings) { | ||
// Insert padding to get to msize | ||
mc_substring_set_insert(set, 0, folded_len + 1, idx++, msize - real_substrings); | ||
} | ||
BSON_ASSERT(idx == set->n_indices); | ||
return set; | ||
} | ||
|
||
static mc_substring_set_t *generate_suffix_tree(const char *base_str, | ||
uint32_t folded_len, | ||
uint32_t unfolded_len, | ||
const mc_FLE2SuffixInsertSpec_t *spec) { | ||
return generate_prefix_or_suffix_tree(base_str, folded_len, unfolded_len, spec->lb, spec->ub, false); | ||
} | ||
|
||
static mc_substring_set_t *generate_prefix_tree(const char *base_str, | ||
uint32_t folded_len, | ||
uint32_t unfolded_len, | ||
const mc_FLE2PrefixInsertSpec_t *spec) { | ||
return generate_prefix_or_suffix_tree(base_str, folded_len, unfolded_len, spec->lb, spec->ub, true); | ||
} | ||
|
||
// Count the substrings of a string of length strlen whose length lies in [lb, ub].
//
// There are len - i + 1 substrings of length i in a length len string. Therefore, the total number of substrings
// with length between lb and ub is the sum of the integers inclusive between A = len - ub' + 1 and B = len - lb + 1,
// where ub' = min(len, ub) and A <= B. This has a closed form: (A + B)(B - A + 1)/2.
//
// Returns 0 when lb > strlen or when the range is empty (lb > ub). The arithmetic is done in 64 bits so the
// intermediate product cannot wrap; a total that does not fit in uint32_t is clamped to UINT32_MAX.
static uint32_t calc_number_of_substrings(uint32_t strlen, uint32_t lb, uint32_t ub) {
    if (lb > strlen || lb > ub) {
        return 0;
    }
    const uint32_t largest_substr = (strlen < ub) ? strlen : ub;
    const uint64_t largest_substr_count = (uint64_t)strlen - largest_substr + 1;
    const uint64_t smallest_substr_count = (uint64_t)strlen - lb + 1;

    uint64_t sum = largest_substr_count + smallest_substr_count;
    uint64_t terms = smallest_substr_count - largest_substr_count + 1;
    // sum and terms always have opposite parity, so exactly one of them is even. Halving that factor before
    // multiplying keeps the division exact and the product within 64 bits.
    if (sum % 2 == 0) {
        sum /= 2;
    } else {
        terms /= 2;
    }
    const uint64_t total = sum * terms;
    return total > UINT32_MAX ? UINT32_MAX : (uint32_t)total;
}
|
||
static mc_substring_set_t *generate_substring_tree(const char *base_str, | ||
uint32_t folded_len, | ||
uint32_t unfolded_len, | ||
const mc_FLE2SubstringInsertSpec_t *spec) { | ||
erwee marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// 16 * ceil(unfolded len / 16) | ||
uint32_t cbclen = 16 * (uint32_t)((unfolded_len + 15) / 16); | ||
if (unfolded_len > spec->mlen || cbclen < spec->lb) { | ||
// No valid substrings, return empty tree | ||
return NULL; | ||
} | ||
// If mlen < cbclen, we only need to pad to mlen | ||
uint32_t padded_len = MIN(spec->mlen, cbclen); | ||
// Total number of substrings -- i.e. the number of valid substrings IF the string spanned the full padded length | ||
uint32_t msize = calc_number_of_substrings(padded_len, spec->lb, spec->ub); | ||
erwee marked this conversation as resolved.
Show resolved
Hide resolved
|
||
uint32_t n_real_substrings = calc_number_of_substrings(folded_len, spec->lb, spec->ub); | ||
// If real_substrings and msize differ, we need to insert padding, so allocate one extra slot. | ||
mc_substring_set_t *set = | ||
mc_substring_set_new(base_str, | ||
folded_len + 1, | ||
n_real_substrings == msize ? n_real_substrings : n_real_substrings + 1); | ||
uint32_t idx = 0; | ||
// If folded_len < LB, there are no real substrings, so we can skip (avoiding underflow via folded_len - LB) | ||
if (folded_len >= spec->lb) { | ||
for (uint32_t i = 0; i < folded_len - spec->lb + 1; i++) { | ||
for (uint32_t j = i + spec->lb; j < MIN(folded_len, i + spec->ub) + 1; j++) { | ||
mc_substring_set_insert(set, i, j, idx++, 1); | ||
erwee marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
} | ||
} | ||
} | ||
if (msize != n_real_substrings) { | ||
BSON_ASSERT(msize > n_real_substrings); | ||
mc_substring_set_insert(set, 0, folded_len + 1, idx++, msize - n_real_substrings); | ||
} | ||
BSON_ASSERT(idx == set->n_indices); | ||
return set; | ||
} | ||
|
||
// Base string = string + 0xFF. All substrings, including padding, can be represented as a view on this. | ||
static char *make_base_string_for_str_encode(const char *folded_str, uint32_t folded_len) { | ||
char *ret = (char *)bson_malloc0(folded_len + 1); | ||
memcpy(ret, folded_str, folded_len); | ||
ret[folded_len] = BAD_CHAR; | ||
return ret; | ||
} | ||
|
||
// TODO MONGOCRYPT-759 This helper only exists to test folded_len != unfolded_len; make the test actually use folding | ||
mc_str_encode_sets_t mc_text_search_str_encode_helper(const mc_FLE2TextSearchInsertSpec_t *spec, | ||
uint32_t unfolded_len) { | ||
const char *folded_str = spec->v; | ||
erwee marked this conversation as resolved.
Show resolved
Hide resolved
|
||
uint32_t folded_len = spec->len; | ||
|
||
mc_str_encode_sets_t sets; | ||
sets.suffix_set = NULL; | ||
sets.prefix_set = NULL; | ||
sets.substring_set = NULL; | ||
// Base string is the folded string plus the 0xFF character | ||
sets.base_string = make_base_string_for_str_encode(folded_str, folded_len); | ||
sets.base_len = spec->len + 1; | ||
erwee marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
if (spec->suffix.set) { | ||
sets.suffix_set = generate_suffix_tree(sets.base_string, folded_len, unfolded_len, &spec->suffix.value); | ||
} | ||
if (spec->prefix.set) { | ||
sets.prefix_set = generate_prefix_tree(sets.base_string, folded_len, unfolded_len, &spec->prefix.value); | ||
} | ||
if (spec->substr.set) { | ||
sets.substring_set = generate_substring_tree(sets.base_string, folded_len, unfolded_len, &spec->substr.value); | ||
} | ||
// Exact string is always the first len characters of the base string | ||
sets.exact = sets.base_string; | ||
sets.exact_len = spec->len; | ||
erwee marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
return sets; | ||
} | ||
|
||
mc_str_encode_sets_t mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpec_t *spec) { | ||
// TODO MONGOCRYPT-759 Implement and use CFold | ||
uint32_t unfolded_len = spec->len; | ||
erwee marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
return mc_text_search_str_encode_helper(spec, unfolded_len); | ||
} | ||
|
||
// Release everything owned by sets: the base string and the three substring sets. The struct itself is not freed
// (it is handed out by value). NULL is accepted and ignored.
//
// Every pointer is reset after being released, so the struct holds no dangling pointers and calling this function
// again on the same struct is harmless.
void mc_str_encode_sets_destroy(mc_str_encode_sets_t *sets) {
    if (sets == NULL) {
        return;
    }
    bson_free(sets->base_string);
    sets->base_string = NULL;
    sets->base_len = 0;
    // The encoder in this file points exact at base_string, so it is cleared rather than freed separately.
    sets->exact = NULL;
    sets->exact_len = 0;

    mc_substring_set_destroy(sets->suffix_set);
    sets->suffix_set = NULL;
    mc_substring_set_destroy(sets->prefix_set);
    sets->prefix_set = NULL;
    mc_substring_set_destroy(sets->substring_set);
    sets->substring_set = NULL;
}
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.