mirror of
git://jb55.com/damus
synced 2024-10-04 19:00:42 +00:00
nostrdb/search: phrase searching working
This changes the search algorithm to be much smarter and more efficient at searching phrases. It is also much simpler, using less intermediate data structures. Signed-off-by: William Casarin <jb55@jb55.com>
This commit is contained in:
parent
213a26cd01
commit
9d208284c6
@ -140,21 +140,6 @@ struct ndb_u64_tsid {
|
|||||||
uint64_t timestamp;
|
uint64_t timestamp;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define MAX_RESULTS_PER_WORD 32
|
|
||||||
#define MAX_TEXT_SEARCH_WORDS 8
|
|
||||||
|
|
||||||
// used mainly by ndb_text_search when collecting text search results
|
|
||||||
struct ndb_keyed_text_search_result {
|
|
||||||
struct ndb_text_search_key key;
|
|
||||||
uint64_t note_id;
|
|
||||||
int prefix_chars;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ndb_keyed_text_search_resultset {
|
|
||||||
struct ndb_keyed_text_search_result results[MAX_RESULTS_PER_WORD];
|
|
||||||
int num_results;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ndb_word
|
struct ndb_word
|
||||||
{
|
{
|
||||||
const char *word;
|
const char *word;
|
||||||
@ -167,29 +152,31 @@ struct ndb_search_words
|
|||||||
int num_words;
|
int num_words;
|
||||||
};
|
};
|
||||||
|
|
||||||
// used mainly by ndb_text_search when collecting text search results
|
|
||||||
struct ndb_keyed_text_search_results {
|
|
||||||
struct ndb_keyed_text_search_resultset results[MAX_TEXT_SEARCH_WORDS];
|
|
||||||
int num_words;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
// ndb_text_search_key
|
// ndb_text_search_key
|
||||||
//
|
//
|
||||||
// This is compressed when in lmdb:
|
// This is compressed when in lmdb:
|
||||||
//
|
//
|
||||||
|
// note_id: varint
|
||||||
// strlen: varint
|
// strlen: varint
|
||||||
// str: cstr
|
// str: cstr
|
||||||
// timestamp: varint
|
// timestamp: varint
|
||||||
// word_index: varint
|
// word_index: varint
|
||||||
|
//
|
||||||
static int ndb_make_text_search_key(unsigned char *buf, int bufsize,
|
static int ndb_make_text_search_key(unsigned char *buf, int bufsize,
|
||||||
int word_index, int word_len, const char *str,
|
int word_index, int word_len, const char *str,
|
||||||
uint64_t timestamp, int *keysize)
|
uint64_t timestamp, uint64_t note_id,
|
||||||
|
int *keysize)
|
||||||
{
|
{
|
||||||
struct cursor cur;
|
struct cursor cur;
|
||||||
int size, pad;
|
int size, pad;
|
||||||
make_cursor(buf, buf + bufsize, &cur);
|
make_cursor(buf, buf + bufsize, &cur);
|
||||||
|
|
||||||
|
// TODO: need update this to uint64_t
|
||||||
|
// we push this first because our query function can pull this off
|
||||||
|
// quicky to check matches
|
||||||
|
if (!push_varint(&cur, (int)note_id))
|
||||||
|
return 0;
|
||||||
|
|
||||||
// string length
|
// string length
|
||||||
if (!push_varint(&cur, word_len))
|
if (!push_varint(&cur, word_len))
|
||||||
return 0;
|
return 0;
|
||||||
@ -198,15 +185,15 @@ static int ndb_make_text_search_key(unsigned char *buf, int bufsize,
|
|||||||
if (!cursor_push_lowercase(&cur, str, word_len))
|
if (!cursor_push_lowercase(&cur, str, word_len))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
// TODO: need update this to uint64_t
|
||||||
|
if (!push_varint(&cur, (int)timestamp))
|
||||||
|
return 0;
|
||||||
|
|
||||||
// the index of the word in the content so that we can do more accurate
|
// the index of the word in the content so that we can do more accurate
|
||||||
// phrase searches
|
// phrase searches
|
||||||
if (!push_varint(&cur, word_index))
|
if (!push_varint(&cur, word_index))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
// TODO: need update this to uint64_t
|
|
||||||
if (!push_varint(&cur, (int)timestamp))
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
size = cur.p - cur.start;
|
size = cur.p - cur.start;
|
||||||
|
|
||||||
// pad to 8-byte alignment
|
// pad to 8-byte alignment
|
||||||
@ -223,11 +210,35 @@ static int ndb_make_text_search_key(unsigned char *buf, int bufsize,
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int ndb_make_noted_text_search_key(unsigned char *buf, int bufsize,
|
||||||
|
int wordlen, const char *word,
|
||||||
|
uint64_t timestamp, uint64_t note_id,
|
||||||
|
int *keysize)
|
||||||
|
{
|
||||||
|
return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word,
|
||||||
|
timestamp, note_id, keysize);
|
||||||
|
}
|
||||||
|
|
||||||
static int ndb_make_text_search_key_low(unsigned char *buf, int bufsize,
|
static int ndb_make_text_search_key_low(unsigned char *buf, int bufsize,
|
||||||
int wordlen, const char *word,
|
int wordlen, const char *word,
|
||||||
int *keysize)
|
int *keysize)
|
||||||
{
|
{
|
||||||
return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word, 0, keysize);
|
uint64_t timestamp, note_id;
|
||||||
|
timestamp = 0;
|
||||||
|
note_id = 0;
|
||||||
|
return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word,
|
||||||
|
timestamp, note_id, keysize);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int ndb_make_text_search_key_high(unsigned char *buf, int bufsize,
|
||||||
|
int wordlen, const char *word,
|
||||||
|
int *keysize)
|
||||||
|
{
|
||||||
|
uint64_t timestamp, note_id;
|
||||||
|
timestamp = UINT64_MAX;
|
||||||
|
note_id = UINT64_MAX;
|
||||||
|
return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word,
|
||||||
|
timestamp, note_id, keysize);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** From LMDB: Compare two items lexically */
|
/** From LMDB: Compare two items lexically */
|
||||||
@ -251,11 +262,16 @@ static int ndb_text_search_key_compare(const MDB_val *a, const MDB_val *b)
|
|||||||
{
|
{
|
||||||
struct cursor ca, cb;
|
struct cursor ca, cb;
|
||||||
int sa, sb;
|
int sa, sb;
|
||||||
|
int nid_a, nid_b;
|
||||||
MDB_val a2, b2;
|
MDB_val a2, b2;
|
||||||
|
|
||||||
make_cursor(a->mv_data, a->mv_data + a->mv_size, &ca);
|
make_cursor(a->mv_data, a->mv_data + a->mv_size, &ca);
|
||||||
make_cursor(b->mv_data, b->mv_data + b->mv_size, &cb);
|
make_cursor(b->mv_data, b->mv_data + b->mv_size, &cb);
|
||||||
|
|
||||||
|
// note_id
|
||||||
|
if (unlikely(!pull_varint(&ca, &nid_a) || !pull_varint(&cb, &nid_b)))
|
||||||
|
return 0;
|
||||||
|
|
||||||
// string size
|
// string size
|
||||||
if (unlikely(!pull_varint(&ca, &sa) || !pull_varint(&cb, &sb)))
|
if (unlikely(!pull_varint(&ca, &sa) || !pull_varint(&cb, &sb)))
|
||||||
return 0;
|
return 0;
|
||||||
@ -280,6 +296,10 @@ static int ndb_text_search_key_compare(const MDB_val *a, const MDB_val *b)
|
|||||||
if (sa < sb) return -1;
|
if (sa < sb) return -1;
|
||||||
else if (sa > sb) return 1;
|
else if (sa > sb) return 1;
|
||||||
|
|
||||||
|
// note_id
|
||||||
|
if (nid_a < nid_b) return -1;
|
||||||
|
else if (nid_a > nid_b) return 1;
|
||||||
|
|
||||||
// word index
|
// word index
|
||||||
if (unlikely(!pull_varint(&ca, &sa) || !pull_varint(&cb, &sb)))
|
if (unlikely(!pull_varint(&ca, &sa) || !pull_varint(&cb, &sb)))
|
||||||
return 0;
|
return 0;
|
||||||
@ -290,6 +310,17 @@ static int ndb_text_search_key_compare(const MDB_val *a, const MDB_val *b)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline int ndb_unpack_text_search_key_noteid(
|
||||||
|
struct cursor *cur, uint64_t *note_id)
|
||||||
|
{
|
||||||
|
int inote_id;
|
||||||
|
if (!pull_varint(cur, &inote_id))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
*note_id = inote_id;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
// faster peek of just the string instead of unpacking everything
|
// faster peek of just the string instead of unpacking everything
|
||||||
// this is used to quickly discard range query matches if there is no
|
// this is used to quickly discard range query matches if there is no
|
||||||
// common prefix
|
// common prefix
|
||||||
@ -316,10 +347,10 @@ ndb_unpack_remaining_text_search_key(struct cursor *cur,
|
|||||||
{
|
{
|
||||||
int timestamp;
|
int timestamp;
|
||||||
|
|
||||||
if (!pull_varint(cur, &key->word_index))
|
if (!pull_varint(cur, ×tamp))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
if (!pull_varint(cur, ×tamp))
|
if (!pull_varint(cur, &key->word_index))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
key->timestamp = timestamp;
|
key->timestamp = timestamp;
|
||||||
@ -338,6 +369,9 @@ static inline int ndb_unpack_text_search_key(unsigned char *p, int len,
|
|||||||
struct cursor c;
|
struct cursor c;
|
||||||
make_cursor(p, p + len, &c);
|
make_cursor(p, p + len, &c);
|
||||||
|
|
||||||
|
if (!ndb_unpack_text_search_key_noteid(&c, &key->note_id))
|
||||||
|
return 0;
|
||||||
|
|
||||||
if (!ndb_unpack_text_search_key_string(&c, &key->str, &key->str_len))
|
if (!ndb_unpack_text_search_key_string(&c, &key->str, &key->str_len))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
@ -2266,7 +2300,8 @@ static int ndb_write_word_to_index(struct ndb_txn *txn, const char *word,
|
|||||||
|
|
||||||
// build our compressed text index key
|
// build our compressed text index key
|
||||||
if (!ndb_make_text_search_key(buffer, sizeof(buffer), word_index,
|
if (!ndb_make_text_search_key(buffer, sizeof(buffer), word_index,
|
||||||
word_len, word, timestamp, &keysize)) {
|
word_len, word, timestamp, note_id,
|
||||||
|
&keysize)) {
|
||||||
// probably too big
|
// probably too big
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
@ -2275,8 +2310,8 @@ static int ndb_write_word_to_index(struct ndb_txn *txn, const char *word,
|
|||||||
k.mv_data = buffer;
|
k.mv_data = buffer;
|
||||||
k.mv_size = keysize;
|
k.mv_size = keysize;
|
||||||
|
|
||||||
v.mv_data = ¬e_id;
|
v.mv_data = NULL;
|
||||||
v.mv_size = sizeof(note_id);
|
v.mv_size = 0;
|
||||||
|
|
||||||
text_db = txn->lmdb->dbs[NDB_DB_NOTE_TEXT];
|
text_db = txn->lmdb->dbs[NDB_DB_NOTE_TEXT];
|
||||||
|
|
||||||
@ -2398,17 +2433,11 @@ static int ndb_parse_search_words(void *ctx, const char *word_str, int word_len,
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ndb_keyed_text_search_results_init(struct ndb_keyed_text_search_results *r)
|
|
||||||
{
|
|
||||||
r->num_words = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ndb_search_words_init(struct ndb_search_words *words)
|
static void ndb_search_words_init(struct ndb_search_words *words)
|
||||||
{
|
{
|
||||||
words->num_words = 0;
|
words->num_words = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int prefix_count(const char *str1, int len1, const char *str2, int len2) {
|
static int prefix_count(const char *str1, int len1, const char *str2, int len2) {
|
||||||
int i, count = 0;
|
int i, count = 0;
|
||||||
int min_len = len1 < len2 ? len1 : len2;
|
int min_len = len1 < len2 ? len1 : len2;
|
||||||
@ -2428,31 +2457,34 @@ static int prefix_count(const char *str1, int len1, const char *str2, int len2)
|
|||||||
// This is called when scanning the full text search index. Scanning stops
|
// This is called when scanning the full text search index. Scanning stops
|
||||||
// when we no longer have a prefix match for the word
|
// when we no longer have a prefix match for the word
|
||||||
static int ndb_text_search_next_word(MDB_cursor *cursor, MDB_cursor_op op,
|
static int ndb_text_search_next_word(MDB_cursor *cursor, MDB_cursor_op op,
|
||||||
MDB_val *k, MDB_val *v, struct ndb_word *search_word,
|
MDB_val *k, struct ndb_word *search_word,
|
||||||
struct ndb_keyed_text_search_resultset *results)
|
struct ndb_text_search_result *last_result,
|
||||||
|
struct ndb_text_search_result *result)
|
||||||
{
|
{
|
||||||
struct ndb_keyed_text_search_result *result;
|
|
||||||
struct cursor key_cursor;
|
struct cursor key_cursor;
|
||||||
|
MDB_val v;
|
||||||
|
|
||||||
make_cursor(k->mv_data, k->mv_data + k->mv_size, &key_cursor);
|
make_cursor(k->mv_data, k->mv_data + k->mv_size, &key_cursor);
|
||||||
|
|
||||||
// if we won't have enough to store the result, then bail
|
|
||||||
if (results->num_results + 1 > MAX_RESULTS_PER_WORD)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
// place to store the results if we get a successful one. num_results
|
|
||||||
// is incremented at the end if so
|
|
||||||
result = &results->results[results->num_results];
|
|
||||||
|
|
||||||
// When op is MDB_SET_RANGE, this initializes the search. Position
|
// When op is MDB_SET_RANGE, this initializes the search. Position
|
||||||
// the cursor at the next key greater than or equal to the specified
|
// the cursor at the next key greater than or equal to the specified
|
||||||
// key.
|
// key.
|
||||||
//
|
//
|
||||||
// Subsequent searches should use MDB_NEXT
|
// Subsequent searches should use MDB_NEXT
|
||||||
if (mdb_cursor_get(cursor, k, v, op))
|
if (mdb_cursor_get(cursor, k, &v, op))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
result->note_id = *((uint64_t*)v->mv_data);
|
make_cursor(k->mv_data, k->mv_data + k->mv_size, &key_cursor);
|
||||||
|
|
||||||
|
if (unlikely(!ndb_unpack_text_search_key_noteid(&key_cursor, &result->key.note_id))) {
|
||||||
|
fprintf(stderr, "UNUSUAL: failed to unpack text search key note_id\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (last_result) {
|
||||||
|
if (last_result->key.note_id != result->key.note_id)
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
// On success, this could still be not related at all.
|
// On success, this could still be not related at all.
|
||||||
// It could just be adjacent to the word. Let's check
|
// It could just be adjacent to the word. Let's check
|
||||||
@ -2462,7 +2494,6 @@ static int ndb_text_search_next_word(MDB_cursor *cursor, MDB_cursor_op op,
|
|||||||
// unpack just the string to check the prefix. We don't
|
// unpack just the string to check the prefix. We don't
|
||||||
// need to unpack the entire key if the prefix doesn't
|
// need to unpack the entire key if the prefix doesn't
|
||||||
// match
|
// match
|
||||||
make_cursor(k->mv_data, k->mv_data + k->mv_size, &key_cursor);
|
|
||||||
if (!ndb_unpack_text_search_key_string(&key_cursor,
|
if (!ndb_unpack_text_search_key_string(&key_cursor,
|
||||||
&result->key.str,
|
&result->key.str,
|
||||||
&result->key.str_len)) {
|
&result->key.str_len)) {
|
||||||
@ -2472,19 +2503,20 @@ static int ndb_text_search_next_word(MDB_cursor *cursor, MDB_cursor_op op,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Empty strings shouldn't happen but let's
|
// Empty strings shouldn't happen but let's
|
||||||
// protect against these cases
|
if (result->key.str_len < 2 || search_word->word_len < 2)
|
||||||
if (result->key.str_len < 1)
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
// make sure we at least have a matching prefix. exact
|
// make sure we at least have two matching prefix characters. exact
|
||||||
// matches are nice but range searches allow us to
|
// matches are nice but range searches allow us to match prefixes as
|
||||||
// match prefixes as well. A single-char prefix is
|
// well. A double-char prefix is suffient, but maybe we could up this
|
||||||
// suffient, but maybe we could up this in the future.
|
// in the future.
|
||||||
//
|
//
|
||||||
// TODO: How are we handling utf-8 prefix matches like
|
// TODO: How are we handling utf-8 prefix matches like
|
||||||
// japanese?
|
// japanese?
|
||||||
//
|
//
|
||||||
if (result->key.str[0] != tolower(search_word->word[0]))
|
if ( result->key.str[0] != tolower(search_word->word[0])
|
||||||
|
&& result->key.str[1] != tolower(search_word->word[1])
|
||||||
|
)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
// count the number of prefix-matched characters. This will be used
|
// count the number of prefix-matched characters. This will be used
|
||||||
@ -2494,6 +2526,9 @@ static int ndb_text_search_next_word(MDB_cursor *cursor, MDB_cursor_op op,
|
|||||||
search_word->word,
|
search_word->word,
|
||||||
search_word->word_len);
|
search_word->word_len);
|
||||||
|
|
||||||
|
if (result->prefix_chars <= (int)((double)search_word->word_len / 1.5))
|
||||||
|
return 0;
|
||||||
|
|
||||||
// Unpack the remaining text search key, we will need this information
|
// Unpack the remaining text search key, we will need this information
|
||||||
// when building up our search results.
|
// when building up our search results.
|
||||||
if (!ndb_unpack_remaining_text_search_key(&key_cursor, &result->key)) {
|
if (!ndb_unpack_remaining_text_search_key(&key_cursor, &result->key)) {
|
||||||
@ -2502,76 +2537,67 @@ static int ndb_text_search_next_word(MDB_cursor *cursor, MDB_cursor_op op,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// let's actually register this result now. the key has already been
|
if (last_result) {
|
||||||
// fully unpacked
|
if (result->key.word_index < last_result->key.word_index) {
|
||||||
results->num_results++;
|
/*
|
||||||
|
fprintf(stderr, "skipping '%.*s' because it is before last result '%.*s'\n",
|
||||||
return 1;
|
result->key.str_len, result->key.str,
|
||||||
}
|
last_result->key.str_len, last_result->key.str);
|
||||||
|
*/
|
||||||
static void ndb_print_text_search_key(struct ndb_text_search_key *key)
|
return 0;
|
||||||
{
|
|
||||||
fprintf(stderr, "K<'%.*s' %d %" PRIu64 ">", key->str_len, key->str,
|
|
||||||
key->word_index,
|
|
||||||
key->timestamp);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ndb_print_keyed_text_search_result(struct ndb_keyed_text_search_result *r)
|
|
||||||
{
|
|
||||||
ndb_print_text_search_key(&r->key);
|
|
||||||
fprintf(stderr, " note_id:%" PRIu64 "\n", r->note_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
// convert the keyed results into actual results that include phrase results.
|
|
||||||
// This works by looping over all of the results
|
|
||||||
static int ndb_process_text_search_results(
|
|
||||||
struct ndb_keyed_text_search_results *keyed_results,
|
|
||||||
struct ndb_text_search_results *results)
|
|
||||||
{
|
|
||||||
int i, j;
|
|
||||||
struct ndb_keyed_text_search_resultset *resultset;
|
|
||||||
struct ndb_keyed_text_search_result *result;
|
|
||||||
|
|
||||||
// Phrase search. Find keyed adjacent keyed search results
|
|
||||||
for (i = 0; i < keyed_results->num_words; i++) {
|
|
||||||
resultset = &keyed_results->results[i];
|
|
||||||
for (j = 0; j < resultset->num_results; j++) {
|
|
||||||
result = &resultset->results[j];
|
|
||||||
ndb_print_keyed_text_search_result(result);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ndb_keyed_text_search_resultset_init(
|
static void ndb_print_text_search_key(struct ndb_text_search_key *key)
|
||||||
struct ndb_keyed_text_search_resultset *resultset) {
|
{
|
||||||
resultset->num_results = 0;
|
printf("K<'%.*s' %d %" PRIu64 " note_id:%" PRIu64 ">", key->str_len, key->str,
|
||||||
|
key->word_index,
|
||||||
|
key->timestamp,
|
||||||
|
key->note_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ndb_text_search_results_init(struct ndb_text_search_results *res)
|
static void ndb_print_text_search_result(struct ndb_txn *txn,
|
||||||
|
struct ndb_text_search_result *r)
|
||||||
{
|
{
|
||||||
res->num_phrase_results = 0;
|
size_t len;
|
||||||
res->num_other_results = 0;
|
struct ndb_note *note;
|
||||||
|
|
||||||
|
ndb_print_text_search_key(&r->key);
|
||||||
|
|
||||||
|
if (!(note = ndb_get_note_by_key(txn, r->key.note_id, &len))) {
|
||||||
|
printf(": note not found");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("\n%s\n\n---\n", ndb_note_str(note, ¬e->content).str);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ndb_text_search_results_init(
|
||||||
|
struct ndb_text_search_results *results) {
|
||||||
|
results->num_results = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ndb_text_search(struct ndb_txn *txn, const char *query,
|
int ndb_text_search(struct ndb_txn *txn, const char *query,
|
||||||
struct ndb_text_search_results *results)
|
struct ndb_text_search_results *results)
|
||||||
{
|
{
|
||||||
unsigned char buffer[1024];
|
unsigned char buffer[1024], *buf;
|
||||||
struct ndb_keyed_text_search_results keyed_results;
|
unsigned char saved_buf[1024], *saved;
|
||||||
struct ndb_keyed_text_search_resultset *keyed_resultset;
|
struct ndb_text_search_result *result, *last_result;
|
||||||
|
struct ndb_text_search_result candidate, last_candidate;
|
||||||
struct ndb_search_words search_words;
|
struct ndb_search_words search_words;
|
||||||
struct ndb_word *search_word;
|
struct ndb_word *search_word;
|
||||||
struct cursor cur;
|
struct cursor cur;
|
||||||
MDB_dbi text_db;
|
MDB_dbi text_db;
|
||||||
MDB_cursor *cursor;
|
MDB_cursor *cursor;
|
||||||
MDB_val k, v;
|
MDB_val k, v;
|
||||||
int i, j, keysize, num_words;
|
int i, j, keysize, saved_size;
|
||||||
MDB_cursor_op op = MDB_SET_RANGE;
|
MDB_cursor_op op;
|
||||||
//int num_note_ids;
|
//int num_note_ids;
|
||||||
|
|
||||||
ndb_keyed_text_search_results_init(&keyed_results);
|
saved = NULL;
|
||||||
ndb_text_search_results_init(results);
|
ndb_text_search_results_init(results);
|
||||||
ndb_search_words_init(&search_words);
|
ndb_search_words_init(&search_words);
|
||||||
|
|
||||||
@ -2580,6 +2606,8 @@ int ndb_text_search(struct ndb_txn *txn, const char *query,
|
|||||||
make_cursor((unsigned char *)query, (unsigned char *)query + strlen(query), &cur);
|
make_cursor((unsigned char *)query, (unsigned char *)query + strlen(query), &cur);
|
||||||
|
|
||||||
ndb_parse_words(&cur, &search_words, ndb_parse_search_words);
|
ndb_parse_words(&cur, &search_words, ndb_parse_search_words);
|
||||||
|
if (search_words.num_words == 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
if ((i = mdb_cursor_open(txn->mdb_txn, text_db, &cursor))) {
|
if ((i = mdb_cursor_open(txn->mdb_txn, text_db, &cursor))) {
|
||||||
fprintf(stderr, "nd_text_search: mdb_cursor_open failed, error %d\n", i);
|
fprintf(stderr, "nd_text_search: mdb_cursor_open failed, error %d\n", i);
|
||||||
@ -2588,63 +2616,124 @@ int ndb_text_search(struct ndb_txn *txn, const char *query,
|
|||||||
|
|
||||||
fprintf(stderr, "search query: '%s'\n", query);
|
fprintf(stderr, "search query: '%s'\n", query);
|
||||||
|
|
||||||
// number of actual result words
|
|
||||||
// for every search word up to a max of MAX_TEXT_SEARCH_WORDS words (8)
|
|
||||||
|
|
||||||
num_words = min(search_words.num_words, MAX_TEXT_SEARCH_WORDS);
|
// for each word, we recursively find all of the submatches
|
||||||
|
while (results->num_results < MAX_TEXT_SEARCH_RESULTS) {
|
||||||
|
last_result = NULL;
|
||||||
|
result = &results->results[results->num_results];
|
||||||
|
|
||||||
for (i = 0; i < num_words; i++) {
|
// if we have saved, then we continue from the last root search
|
||||||
search_word = &search_words.words[i];
|
// sequence
|
||||||
|
if (saved) {
|
||||||
|
/*
|
||||||
|
fprintf(stderr, "continuing from ");
|
||||||
|
if (ndb_unpack_text_search_key(saved_buf, saved_size, &search_key)) {
|
||||||
|
ndb_print_text_search_key(&search_key);
|
||||||
|
}
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
*/
|
||||||
|
buf = saved_buf;
|
||||||
|
saved = NULL;
|
||||||
|
keysize = saved_size;
|
||||||
|
|
||||||
// shouldn't happen but let's be defensive a bit
|
k.mv_data = buf;
|
||||||
if (search_word->word_len == 0)
|
k.mv_size = saved_size;
|
||||||
continue;
|
|
||||||
|
|
||||||
// construct a packed fulltext search key using this word
|
// reposition the cursor so we can continue
|
||||||
// this key doesn't contain any timestamp or index info,
|
if (mdb_cursor_get(cursor, &k, &v, MDB_SET_RANGE))
|
||||||
// so it should range match instead of exact match
|
break;
|
||||||
if (!ndb_make_text_search_key_low(buffer, sizeof(buffer),
|
|
||||||
search_word->word_len,
|
op = MDB_NEXT;
|
||||||
search_word->word,
|
} else {
|
||||||
&keysize)) {
|
// construct a packed fulltext search key using this
|
||||||
// word is too big to fit in 1024-sized key
|
// word this key doesn't contain any timestamp or index
|
||||||
continue;
|
// info, so it should range match instead of exact
|
||||||
|
// match
|
||||||
|
if (!ndb_make_text_search_key_low(
|
||||||
|
buffer, sizeof(buffer),
|
||||||
|
search_words.words[0].word_len,
|
||||||
|
search_words.words[0].word, &keysize))
|
||||||
|
{
|
||||||
|
// word is too big to fit in 1024-sized key
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
buf = buffer;
|
||||||
|
op = MDB_SET_RANGE;
|
||||||
}
|
}
|
||||||
|
|
||||||
k.mv_data = buffer;
|
for (j = 0; j < search_words.num_words; j++) {
|
||||||
k.mv_size = keysize;
|
search_word = &search_words.words[j];
|
||||||
op = MDB_SET_RANGE;
|
|
||||||
|
|
||||||
keyed_resultset = &keyed_results.results[keyed_results.num_words];
|
// shouldn't happen but let's be defensive a bit
|
||||||
ndb_keyed_text_search_resultset_init(keyed_resultset);
|
if (search_word->word_len == 0)
|
||||||
|
continue;
|
||||||
|
|
||||||
// try to match the word as many times as possible before
|
// if we already matched a note in this phrase, make
|
||||||
// moving on to the next word
|
// sure we're including the note id in the query
|
||||||
for (j = 0; j < MAX_RESULTS_PER_WORD; j++)
|
if (last_result) {
|
||||||
{
|
// we are narrowing down a search.
|
||||||
if (!ndb_text_search_next_word(cursor, op, &k, &v,
|
// if we already have this note id, just continue
|
||||||
|
for (i = 0; i < results->num_results; i++) {
|
||||||
|
if (results->results[i].key.note_id == last_result->key.note_id)
|
||||||
|
goto cont;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!ndb_make_noted_text_search_key(
|
||||||
|
buffer, sizeof(buffer),
|
||||||
|
search_word->word_len,
|
||||||
|
search_word->word,
|
||||||
|
last_result->key.timestamp,
|
||||||
|
last_result->key.note_id,
|
||||||
|
&keysize))
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
buf = buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
k.mv_data = buf;
|
||||||
|
k.mv_size = keysize;
|
||||||
|
|
||||||
|
if (!ndb_text_search_next_word(cursor, op, &k,
|
||||||
search_word,
|
search_word,
|
||||||
keyed_resultset)) {
|
last_result,
|
||||||
|
&candidate)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
op = MDB_NEXT;
|
*result = candidate;
|
||||||
|
op = MDB_SET_RANGE;
|
||||||
|
|
||||||
|
// save the first key match, since we will continue from
|
||||||
|
// this on the next root word result
|
||||||
|
if (j == 0 && !saved) {
|
||||||
|
memcpy(saved_buf, k.mv_data, k.mv_size);
|
||||||
|
saved = saved_buf;
|
||||||
|
saved_size = k.mv_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
last_candidate = *result;
|
||||||
|
last_result = &last_candidate;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (keyed_resultset->num_results > 0)
|
cont:
|
||||||
keyed_results.num_words++;
|
// we matched all of the queries!
|
||||||
|
if (j == search_words.num_words) {
|
||||||
/*
|
results->num_results++;
|
||||||
if ((note = ndb_get_note_by_key(txn, note_id, &len))) {
|
} else if (j == 0) {
|
||||||
fprintf(stderr, "found note: '%s' for query word '%.*s'\n",
|
break;
|
||||||
ndb_note_str(note, ¬e->content).str,
|
|
||||||
word->word_len, word->word);
|
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// convert our keyed results into
|
// print results for now
|
||||||
ndb_process_text_search_results(&keyed_results, results);
|
for (j = 0; j < results->num_results; j++) {
|
||||||
|
result = &results->results[j];
|
||||||
|
printf("[%02d] ", j+1);
|
||||||
|
ndb_print_text_search_result(txn, result);
|
||||||
|
}
|
||||||
|
|
||||||
mdb_cursor_close(cursor);
|
mdb_cursor_close(cursor);
|
||||||
|
|
||||||
|
@ -163,6 +163,30 @@ struct ndb_keypair {
|
|||||||
unsigned char pair[96];
|
unsigned char pair[96];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define MAX_TEXT_SEARCH_RESULTS 128
|
||||||
|
#define MAX_TEXT_SEARCH_WORDS 8
|
||||||
|
|
||||||
|
// unpacked form of the actual lmdb fulltext search key
|
||||||
|
// see `ndb_make_text_search_key` for how the packed version is constructed
|
||||||
|
struct ndb_text_search_key
|
||||||
|
{
|
||||||
|
int str_len;
|
||||||
|
const char *str;
|
||||||
|
uint64_t timestamp;
|
||||||
|
uint64_t note_id;
|
||||||
|
int word_index;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ndb_text_search_result {
|
||||||
|
struct ndb_text_search_key key;
|
||||||
|
int prefix_chars;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ndb_text_search_results {
|
||||||
|
struct ndb_text_search_result results[MAX_TEXT_SEARCH_RESULTS];
|
||||||
|
int num_results;
|
||||||
|
};
|
||||||
|
|
||||||
// these must be byte-aligned, they are directly accessing the serialized data
|
// these must be byte-aligned, they are directly accessing the serialized data
|
||||||
// representation
|
// representation
|
||||||
#pragma pack(push, 1)
|
#pragma pack(push, 1)
|
||||||
@ -271,25 +295,6 @@ struct ndb_filter {
|
|||||||
struct ndb_filter_elements *elements[NDB_NUM_FILTERS];
|
struct ndb_filter_elements *elements[NDB_NUM_FILTERS];
|
||||||
};
|
};
|
||||||
|
|
||||||
// unpacked form of the actual lmdb fulltext search key
|
|
||||||
// see `ndb_make_text_search_key` for how the packed version is constructed
|
|
||||||
struct ndb_text_search_key
|
|
||||||
{
|
|
||||||
int str_len;
|
|
||||||
const char *str;
|
|
||||||
int word_index;
|
|
||||||
uint64_t timestamp;
|
|
||||||
};
|
|
||||||
|
|
||||||
// contains note ids of the searched notes from ndb_text_search
|
|
||||||
struct ndb_text_search_results {
|
|
||||||
uint64_t phrase_results[32];
|
|
||||||
int num_phrase_results;
|
|
||||||
|
|
||||||
uint64_t other_results[32];
|
|
||||||
int num_other_results;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
// HELPERS
|
// HELPERS
|
||||||
int ndb_calculate_id(struct ndb_note *note, unsigned char *buf, int buflen);
|
int ndb_calculate_id(struct ndb_note *note, unsigned char *buf, int buflen);
|
||||||
|
Loading…
Reference in New Issue
Block a user