nostrdb/Add fulltext search index

Signed-off-by: William Casarin <jb55@jb55.com>
2024-10-04 19:00:42 +00:00 · 2023-11-27 16:08:42 -08:00 · 2023-11-27 16:08:42 -08:00 · d541153e4c
commit d541153e4c
parent 53fc1b6945
2 changed files with 555 additions and 19 deletions
--- a/nostrdb/nostrdb.c
+++ b/nostrdb/nostrdb.c
@ -44,6 +44,8 @@ static const int DEFAULT_QUEUE_SIZE = 1000000;
 #define NDB_PARSED_ALL          (NDB_PARSED_ID|NDB_PARSED_PUBKEY|NDB_PARSED_SIG|NDB_PARSED_CREATED_AT|NDB_PARSED_KIND|NDB_PARSED_CONTENT|NDB_PARSED_TAGS)

 typedef int (*ndb_migrate_fn)(struct ndb *);
+// Callback invoked by ndb_parse_words once per word it finds.
+//
+// The first argument is the caller-supplied context pointer. `word` points
+// at the first byte of the word inside the text being parsed and is NOT
+// NUL-terminated; `word_len` is its length in bytes. `word_index` is the
+// number of words the callback has accepted so far.
+//
+// Return non-zero to accept the word (the next word gets word_index + 1),
+// or 0 to reject it (the next word is reported with the same word_index).
+typedef int (*ndb_word_parser_fn)(void *, const char *word, int word_len,
+				  int word_index);

 struct ndb_migration {
 	ndb_migrate_fn fn;
@ -133,6 +135,156 @@ struct ndb_u64_tsid {
 	uint64_t timestamp;
 };

+// uncompressed form of the actual lmdb key
+//
+// NOTE(review): nothing live in this part of the file reads or writes this
+// struct yet; the field notes below describe the intended mapping onto the
+// compressed key built by ndb_make_text_search_key.
+struct ndb_text_search_key
+{
+	// length of `str` in bytes
+	int str_len;
+	// the indexed word; presumably not NUL-terminated, since the compressed
+	// key stores the raw bytes only -- confirm before using as a C string
+	const char *str;
+	// position of the word within the note content
+	int word_index;
+	// timestamp stored alongside the word (callers pass the note's
+	// created_at)
+	uint64_t timestamp;
+};
+
+// ndb_text_search_key
+//
+// Builds the compressed lmdb key for one indexed word into `buf`.
+//
+// Layout, in encoding order:
+//
+//   strlen:     varint
+//   str:        word_len raw bytes (not NUL-terminated)
+//   timestamp:  varint
+//   word_index: varint
+//   padding:    zero bytes up to the next multiple of 8
+//
+// The fields are encoded in exactly this order so that the layout above and
+// the field order used by the key comparator agree.
+//
+// Returns 1 and stores the padded key length in *keysize on success, or 0
+// when the key does not fit in `bufsize` bytes.
+static int ndb_make_text_search_key(unsigned char *buf, int bufsize,
+				    int word_index, int word_len, const char *str,
+				    uint64_t timestamp, int *keysize)
+{
+	struct cursor cur;
+	int size, pad;
+	make_cursor(buf, buf + bufsize, &cur);
+
+	// string length
+	if (!push_varint(&cur, word_len))
+		return 0;
+
+	// non-null terminated string
+	if (!cursor_push(&cur, (unsigned char*)str, word_len))
+		return 0;
+
+	// TODO: this needs to be encoded as a uint64_t; the timestamp is
+	// currently narrowed to int before it is written
+	if (!push_varint(&cur, (int)timestamp))
+		return 0;
+
+	// the index of the word in the content so that we can do more accurate
+	// phrase searches
+	if (!push_varint(&cur, word_index))
+		return 0;
+
+	size = cur.p - cur.start;
+
+	// pad to 8-byte alignment
+	pad = ((size + 7) & ~7) - size;
+	if (pad > 0) {
+		if (!cursor_memset(&cur, 0, pad)) {
+			return 0;
+		}
+	}
+
+	*keysize = cur.p - cur.start;
+	assert((*keysize % 8) == 0);
+
+	return 1;
+}
+
+// Builds the lowest-sorting key for `word`: the same key layout as
+// ndb_make_text_search_key with both the word index and the timestamp set
+// to zero. Used as the starting point of a range lookup.
+//
+// Returns whatever ndb_make_text_search_key returns (1 on success, 0 if the
+// key does not fit in `bufsize`).
+static int ndb_make_text_search_key_low(unsigned char *buf, int bufsize,
+					int wordlen, const char *word,
+					int *keysize)
+{
+	const int lowest_word_index = 0;
+	const uint64_t lowest_timestamp = 0;
+
+	return ndb_make_text_search_key(buf, bufsize, lowest_word_index,
+					wordlen, word, lowest_timestamp,
+					keysize);
+}
+
+/** From LMDB: Compare two items lexically */
+static int mdb_cmp_memn(const MDB_val *a, const MDB_val *b) {
+	// number of bytes both items have in common
+	unsigned int common = a->mv_size;
+	ssize_t delta = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
+	int order;
+
+	if (delta > 0)
+		common = b->mv_size;
+
+	order = memcmp(a->mv_data, b->mv_data, common);
+	if (order != 0)
+		return order;
+
+	// identical shared prefix: the shorter item sorts first
+	if (delta < 0)
+		return -1;
+
+	return delta > 0 ? 1 : 0;
+}
+
+// lmdb key comparator for the fulltext index.
+//
+// Keys are ordered by the word bytes first (lexically, a shorter word that
+// is a prefix of a longer one sorts first), then by the two varint fields
+// that follow the word, compared in the order they were encoded.
+//
+// Returns a negative value, 0 or a positive value when `a` sorts before,
+// equal to or after `b`. A key that cannot be decoded compares as equal (0).
+static int ndb_text_search_key_compare(const MDB_val *a, const MDB_val *b)
+{
+	struct cursor ca, cb;
+	int sa, sb;
+	MDB_val a2, b2;
+
+	// cast before offsetting: arithmetic on void * is a GNU extension,
+	// not standard C
+	make_cursor((unsigned char *)a->mv_data,
+		    (unsigned char *)a->mv_data + a->mv_size, &ca);
+	make_cursor((unsigned char *)b->mv_data,
+		    (unsigned char *)b->mv_data + b->mv_size, &cb);
+
+	// string size
+	if (unlikely(!pull_varint(&ca, &sa) || !pull_varint(&cb, &sb)))
+		return 0;
+
+	// a bad length must not make memcmp or the skip below run past the
+	// end of either key
+	if (unlikely(sa < 0 || sb < 0 ||
+		     sa > ca.end - ca.p || sb > cb.end - cb.p))
+		return 0;
+
+	a2.mv_data = ca.p;
+	a2.mv_size = sa;
+
+	b2.mv_data = cb.p;
+	b2.mv_size = sb;
+
+	int cmp = mdb_cmp_memn(&a2, &b2);
+	if (cmp) return cmp;
+
+	// skip over string
+	ca.p += sa;
+	cb.p += sb;
+
+	// first varint field after the word
+	if (unlikely(!pull_varint(&ca, &sa) || !pull_varint(&cb, &sb)))
+		return 0;
+
+	if      (sa < sb) return -1;
+	else if (sa > sb) return 1;
+
+	// second varint field after the word
+	if (unlikely(!pull_varint(&ca, &sa) || !pull_varint(&cb, &sb)))
+		return 0;
+
+	if      (sa < sb) return -1;
+	else if (sa > sb) return 1;
+
+	return 0;
+}
+
+/*
+static int ndb_decompress_text_search_key(unsigned char *p, int len,
+					  struct ndb_text_search_key *key)
+{
+	struct cursor c;
+
+	make_cursor(p, p + len, &c);
+
+	if (!pull_varint(&c, &key->str_len))
+		return 0;
+
+	key->str = cur->p;
+
+	if (!cursor_skip(&c, key->str_len))
+		return 0;
+
+	if (!pull_varint(&c, &key->word_index))
+		return 0;
+
+	if (!pull_varint(&c, &key->timestamp))
+		return 0;
+
+}
+*/
+
 // Copies only lowercase characters to the destination string and fills the rest with null bytes.
 // `dst` and `src` are pointers to the destination and source strings, respectively.
 // `n` is the maximum number of characters to copy.
@ -742,23 +894,6 @@ int ndb_db_version(struct ndb *ndb)
 	return version;
 }

-/** From LMDB: Compare two items lexically */
-static int mdb_cmp_memn(const MDB_val *a, const MDB_val *b) {
-	int diff;
-	ssize_t len_diff;
-	unsigned int len;
-
-	len = a->mv_size;
-	len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
-	if (len_diff > 0) {
-		len = b->mv_size;
-		len_diff = 1;
-	}
-
-	diff = memcmp(a->mv_data, b->mv_data, len);
-	return diff ? diff : len_diff<0 ? -1 : len_diff;
-}
-
 // custom kind+timestamp comparison function. This is used by lmdb to perform
 // b+ tree searches over the kind+timestamp index
 static int ndb_u64_tsid_compare(const MDB_val *a, const MDB_val *b)
@ -814,10 +949,10 @@ static inline void ndb_tsid_init(struct ndb_tsid *key, unsigned char *id,
 	key->timestamp = timestamp;
 }

-static inline void ndb_u64_tsid_init(struct ndb_tsid *key, uint64_t integer,
+static inline void ndb_u64_tsid_init(struct ndb_u64_tsid *key, uint64_t integer,
 				     uint64_t timestamp)
 {
-	key->integer = integer;
+	key->u64 = integer;
 	key->timestamp = timestamp;
 }

@ -1877,6 +2012,388 @@ static int ndb_write_note_kind_index(struct ndb_txn *txn, struct ndb_note *note,
 	return 1;
 }

+/**
+  * Checks if a given Unicode code point is a punctuation character
+  *
+  * @param codepoint The Unicode code point to check. @return 1 if the
+  * code point is a punctuation character, 0 otherwise.
+  */
+static inline int is_punctuation(unsigned int codepoint) {
+	// Check for underscore (underscore is not treated as punctuation)
+	if (codepoint == '_')
+		return 0;
+
+	// Check for ASCII punctuation. ispunct() is only defined for values
+	// representable as unsigned char (or EOF), so never hand it a wider
+	// code point; the non-ASCII cases are covered by the ranges below.
+	if (codepoint < 0x80 && ispunct((int)codepoint))
+		return 1;
+
+	// Check for Unicode punctuation exceptions (punctuation allowed in hashtags)
+	if (codepoint == 0x301C || codepoint == 0xFF5E) // Japanese Wave Dash / Tilde
+		return 0;
+
+	// Check for Unicode punctuation
+	// NOTE: We may need to adjust the codepoint ranges in the future,
+	// to include/exclude certain types of Unicode characters in hashtags.
+	// Unicode Blocks Reference: https://www.compart.com/en/unicode/block
+	return (
+		// Latin-1 Supplement No-Break Space (NBSP): U+00A0
+		(codepoint == 0x00A0) ||
+
+		// Latin-1 Supplement Punctuation: U+00A1 to U+00BF
+		(codepoint >= 0x00A1 && codepoint <= 0x00BF) ||
+
+		// General Punctuation: U+2000 to U+206F
+		(codepoint >= 0x2000 && codepoint <= 0x206F) ||
+
+		// Currency Symbols: U+20A0 to U+20CF
+		(codepoint >= 0x20A0 && codepoint <= 0x20CF) ||
+
+		// Supplemental Punctuation: U+2E00 to U+2E7F
+		(codepoint >= 0x2E00 && codepoint <= 0x2E7F) ||
+
+		// CJK Symbols and Punctuation: U+3000 to U+303F
+		(codepoint >= 0x3000 && codepoint <= 0x303F) ||
+
+		// Ideographic Description Characters: U+2FF0 to U+2FFF
+		(codepoint >= 0x2FF0 && codepoint <= 0x2FFF)
+	);
+}
+
+// Returns 1 when `c` is one of the six ASCII whitespace characters
+// (space, tab, newline, vertical tab, form feed, carriage return), else 0.
+static inline int is_whitespace(char c) {
+	switch (c) {
+	case ' ':
+	case '\t':
+	case '\n':
+	case '\v':
+	case '\f':
+	case '\r':
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+// Returns 1 when `c` ends a word, i.e. it is ASCII whitespace or
+// punctuation as defined by is_punctuation; 0 otherwise. `c` may be a raw
+// byte or a decoded Unicode code point.
+static inline int is_right_boundary(int c) {
+	// is_whitespace takes a char, so only pass values that fit in ASCII.
+	// Narrowing a wider code point would keep just its low byte and make
+	// e.g. U+0120 (low byte 0x20) look like a space.
+	if (c >= 0 && c < 0x80 && is_whitespace((char)c))
+		return 1;
+
+	return is_punctuation(c);
+}
+
+// Copies the byte under the cursor into *c without advancing the cursor.
+// Returns 1 on success, or 0 (leaving *c untouched) when the cursor is at
+// or past its end.
+static inline int parse_byte(struct cursor *cursor, unsigned char *c)
+{
+	int exhausted = cursor->p >= cursor->end;
+
+	if (unlikely(exhausted))
+		return 0;
+
+	c[0] = cursor->p[0];
+	return 1;
+}
+
+// Returns the byte `ind` positions away from the cursor (negative values
+// look backwards) without moving the cursor, or -1 when that position lies
+// outside [start, end).
+static inline int peek_char(struct cursor *cur, int ind) {
+	int before_start = cur->p + ind < cur->start;
+	int past_end = cur->p + ind >= cur->end;
+
+	if (before_start || past_end)
+		return -1;
+
+	return cur->p[ind];
+}
+
+// Decodes the UTF-8 character that starts at the cursor WITHOUT advancing
+// the cursor.
+//
+// On success returns 1, stores the decoded value in *code_point and the
+// encoded length in bytes (1-4) in *utf8_length.
+//
+// Returns 0 when the cursor is exhausted, the first byte is not a valid
+// lead byte, or a continuation byte is missing or malformed. On the invalid
+// and truncated paths *utf8_length is set to 1 so callers can step over the
+// offending byte; *code_point then holds a partial or placeholder value.
+// If the cursor is already exhausted neither output is written.
+//
+// No checks for overlong encodings or surrogate code points are performed.
+static int parse_utf8_char(struct cursor *cursor, unsigned int *code_point,
+			   unsigned int *utf8_length)
+{
+	unsigned char first_byte;
+	if (!parse_byte(cursor, &first_byte))
+		return 0; // Not enough data
+
+	// Determine the number of bytes in this UTF-8 character
+	int remaining_bytes = 0;
+	if (first_byte < 0x80) {
+		// plain ASCII: report the length too, so the out-parameter is
+		// always written on success
+		*utf8_length = 1;
+		*code_point = first_byte;
+		return 1;
+	} else if ((first_byte & 0xE0) == 0xC0) {
+		remaining_bytes = 1;
+		*utf8_length = remaining_bytes + 1;
+		*code_point = first_byte & 0x1F;
+	} else if ((first_byte & 0xF0) == 0xE0) {
+		remaining_bytes = 2;
+		*utf8_length = remaining_bytes + 1;
+		*code_point = first_byte & 0x0F;
+	} else if ((first_byte & 0xF8) == 0xF0) {
+		remaining_bytes = 3;
+		*utf8_length = remaining_bytes + 1;
+		*code_point = first_byte & 0x07;
+	} else {
+		remaining_bytes = 0;
+		*utf8_length = 1; // Assume 1 byte length for unrecognized UTF-8 characters
+		// TODO: We need to gracefully handle unrecognized UTF-8 characters
+		//printf("Invalid UTF-8 byte: %x\n", *code_point);
+		// Placeholder meant to keep the byte from testing as punctuation.
+		// NOTE(review): for first bytes 0x80-0x8F this yields 0x2000,
+		// which is inside the General Punctuation range -- confirm
+		// whether that is intended.
+		*code_point = ((first_byte & 0xF0) << 6);
+		return 0; // Invalid first byte
+	}
+
+	// Peek at remaining bytes
+	for (int i = 0; i < remaining_bytes; ++i) {
+		signed char next_byte;
+		if ((next_byte = peek_char(cursor, i+1)) == -1) {
+			*utf8_length = 1;
+			return 0; // Not enough data
+		}
+
+		// continuation bytes must look like 10xxxxxx
+		if ((next_byte & 0xC0) != 0x80) {
+			*utf8_length = 1;
+			return 0; // Invalid byte in sequence
+		}
+
+		*code_point = (*code_point << 6) | (next_byte & 0x3F);
+	}
+
+	return 1;
+}
+
+
+// Non-zero (0x80) when the high bit of `c` is set, i.e. the byte is part of
+// a multi-byte UTF-8 sequence rather than plain ASCII; 0 otherwise.
+static inline int is_utf8_byte(unsigned char c) {
+	const int high_bit = 0x80;
+
+	return c >= 0x80 ? high_bit : 0;
+}
+
+// Advances the cursor until it sits on a word boundary (whitespace or
+// punctuation) or reaches the end of the buffer. The boundary character
+// itself is not consumed. Always returns 1.
+static inline int consume_until_boundary(struct cursor *cur) {
+	unsigned int codepoint;
+	unsigned int step;
+
+	for (;;) {
+		if (cur->p >= cur->end)
+			break;
+
+		codepoint = *cur->p;
+		step = 1;
+
+		if (is_whitespace(codepoint))
+			return 1;
+
+		// Multi-byte UTF-8: decode the whole character so the boundary
+		// test sees a code point and we can step over all of its bytes.
+		// A failed decode is tolerated; whatever value it left behind is
+		// tested below and the step falls back to a single byte.
+		// TODO: We should work towards handling all UTF-8 characters.
+		if (is_utf8_byte(codepoint))
+			(void)parse_utf8_char(cur, &codepoint, &step);
+
+		if (is_right_boundary(codepoint))
+			return 1;
+
+		// Need to use a variable character byte length for UTF-8 (2-4
+		// bytes), but never step past the end of the buffer
+		if (cur->p + step <= cur->end)
+			cur->p += step;
+		else
+			cur->p++;
+	}
+
+	return 1;
+}
+
+// Skips forward over every consecutive byte that is a word boundary
+// (whitespace or punctuation), stopping at the first byte that is not, or
+// at the end of the buffer. The test is applied one byte at a time.
+static void consume_whitespace_or_punctuation(struct cursor *cur)
+{
+	for (; cur->p < cur->end; cur->p++) {
+		if (!is_right_boundary(*cur->p))
+			break;
+	}
+}
+
+// Writes one fulltext index entry: a compressed key built from the word,
+// its index in the note and the timestamp, mapped to `note_id`.
+//
+// Returns 1 on success. Returns 0 when the key cannot be built (it does not
+// fit in the 1024-byte key buffer) or when mdb_put fails.
+static int ndb_write_word_to_index(struct ndb_txn *txn, const char *word,
+				   int word_len, int word_index,
+				   uint64_t timestamp, uint64_t note_id)
+{
+	// cap to some reasonable key size
+	unsigned char keybuf[1024];
+	MDB_val key, val;
+	int key_len, err;
+
+	// build our compressed text index key; failure most likely means the
+	// word is too big for the buffer
+	if (!ndb_make_text_search_key(keybuf, sizeof(keybuf), word_index,
+				      word_len, word, timestamp, &key_len))
+		return 0;
+
+	key.mv_size = key_len;
+	key.mv_data = keybuf;
+
+	val.mv_size = sizeof(note_id);
+	val.mv_data = &note_id;
+
+	err = mdb_put(txn->mdb_txn, txn->lmdb->dbs[NDB_DB_NOTE_TEXT],
+		      &key, &val, 0);
+	if (err == 0)
+		return 1;
+
+	ndb_debug("write note text index to db failed: %s\n",
+			mdb_strerror(err));
+	return 0;
+}
+
+
+
+// Splits the text under `cur` into words and calls `fn(ctx, word, len, n)`
+// once per word, where `n` is the number of words `fn` has accepted so far.
+// Words are runs of bytes between whitespace/punctuation boundaries; the
+// word pointer refers into the cursor's buffer and is not NUL-terminated.
+//
+// When `fn` returns 0 the word is skipped and the running index is not
+// advanced. Always returns 1.
+static int ndb_parse_words(struct cursor *cur, void *ctx, ndb_word_parser_fn fn)
+{
+	int word_len, words;
+	const char *word;
+
+	words = 0;
+
+	while (cur->p < cur->end) {
+		consume_whitespace_or_punctuation(cur);
+		if (cur->p >= cur->end)
+			break;
+		word = (const char *)cur->p;
+
+		if (!consume_until_boundary(cur))
+			break;
+
+		// start of word or end
+		word_len = cur->p - (unsigned char *)word;
+		if (word_len == 0 && cur->p >= cur->end)
+			break;
+
+		// The boundary skipper above works byte by byte, while
+		// consume_until_boundary decodes UTF-8. A multi-byte punctuation
+		// character is therefore not skipped, yet stops the word scan
+		// immediately, leaving the cursor where it was. Step over one
+		// byte so the loop always makes progress instead of spinning
+		// forever (and emitting empty words).
+		if (word_len == 0) {
+			cur->p++;
+			continue;
+		}
+
+		if (!fn(ctx, word, word_len, words))
+			continue;
+
+		words++;
+	}
+
+	return 1;
+}
+
+// Context handed to ndb_fulltext_word_writer through the word parser's
+// opaque `ctx` pointer while a note's content is being indexed.
+struct ndb_word_writer_ctx
+{
+	// transaction the index entries are written in
+	struct ndb_txn *txn;
+	// note being indexed; its created_at is used as the key timestamp
+	struct ndb_note *note;
+	// value stored for every word of this note
+	uint64_t note_id;
+};
+
+// ndb_word_parser_fn callback that writes one word of a note into the
+// fulltext index. `ctx` must point at a struct ndb_word_writer_ctx.
+//
+// Returns 1 when the word was written, 0 when it could not be (for example
+// because the resulting key is too big); the caller then skips the word.
+static int ndb_fulltext_word_writer(void *ctx,
+		const char *word, int word_len, int words)
+{
+	struct ndb_word_writer_ctx *wctx = ctx;
+
+	if (!ndb_write_word_to_index(wctx->txn, word, word_len, words,
+				     wctx->note->created_at, wctx->note_id)) {
+		// too big to write this one, just skip it
+		// ndb_debug takes a printf-style format as its first argument
+		// (see its other call sites), not a FILE *
+		ndb_debug("failed to write word '%.*s' to index\n", word_len, word);
+
+		return 0;
+	}
+
+	//fprintf(stderr, "wrote '%.*s' to note text index\n", word_len, word);
+	return 1;
+}
+
+// Indexes every word of a note's content in the fulltext index, storing
+// `note_id` as the value of each entry.
+//
+// Returns 0 if the note's content resolves to a packed id instead of a
+// string, 1 otherwise. Individual words that fail to be written are
+// skipped and do not affect the result.
+static int ndb_write_note_fulltext_index(struct ndb_txn *txn,
+					 struct ndb_note *note,
+					 uint64_t note_id)
+{
+	struct ndb_word_writer_ctx ctx;
+	struct cursor cur;
+	unsigned char *begin, *finish;
+	struct ndb_str str = ndb_note_str(note, &note->content);
+
+	// I don't think this should happen?
+	if (unlikely(str.flag == NDB_PACKED_ID))
+		return 0;
+
+	begin = (unsigned char *)str.str;
+	finish = begin + note->content_length;
+	make_cursor(begin, finish, &cur);
+
+	ctx.note_id = note_id;
+	ctx.note = note;
+	ctx.txn = txn;
+
+	ndb_parse_words(&cur, &ctx, ndb_fulltext_word_writer);
+
+	return 1;
+}
+
+// One word of a search query: a pointer/length pair referring into the
+// query string.
+struct ndb_word
+{
+	// first byte of the word; not NUL-terminated, use word_len
+	const char *word;
+	// length of the word in bytes
+	int word_len;
+};
+
+#define MAX_SEARCH_WORDS 16
+
+// Fixed-capacity list of the words parsed out of a search query.
+struct ndb_search_words
+{
+	// parsed words; only the first num_words entries are valid
+	struct ndb_word words[MAX_SEARCH_WORDS];
+	// number of entries filled in, at most MAX_SEARCH_WORDS
+	int num_words;
+};
+
+// ndb_word_parser_fn callback that appends each query word to the
+// struct ndb_search_words passed as `ctx`. `word_index` is not used.
+//
+// Returns 1 when the word was stored, 0 when the list is already full.
+static int ndb_parse_search_words(void *ctx, const char *word_str, int word_len, int word_index)
+{
+	struct ndb_search_words *list = ctx;
+	struct ndb_word *slot;
+
+	if (list->num_words >= MAX_SEARCH_WORDS)
+		return 0;
+
+	slot = list->words + list->num_words;
+	slot->word_len = word_len;
+	slot->word = word_str;
+	list->num_words++;
+
+	return 1;
+}
+
+// Looks up the words of `query` in the fulltext index and reports the first
+// hit on stderr.
+//
+// The query is split into at most MAX_SEARCH_WORDS words. For each word, in
+// order, the index cursor is positioned at the first key greater than or
+// equal to that word's lowest possible key. The search stops at the first
+// word for which such a key exists; the note stored under it is fetched and
+// printed.
+// NOTE(review): a range lookup can land on a key belonging to a different,
+// later-sorting word, so a hit is not checked to actually contain the query
+// word -- confirm whether an exact-word check is wanted here.
+//
+// Returns 0 if the index cursor cannot be opened, 1 otherwise (whether or
+// not anything was found).
+int ndb_text_search(struct ndb_txn *txn, const char *query)
+{
+	unsigned char buffer[1024];
+	struct ndb_search_words words;
+	struct ndb_word *word;
+	struct cursor cur;
+	MDB_dbi text_db;
+	MDB_cursor *cursor;
+	MDB_val k, v;
+	int i, rc, keysize;
+	size_t len;
+	//uint64_t note_ids[32], note_id;
+	uint64_t note_id;
+	struct ndb_note *note;
+	//int num_note_ids;
+
+	//num_note_ids = 0;
+	text_db = txn->lmdb->dbs[NDB_DB_NOTE_TEXT];
+	make_cursor((unsigned char *)query, (unsigned char *)query + strlen(query), &cur);
+	words.num_words = 0;
+
+	ndb_parse_words(&cur, &words, ndb_parse_search_words);
+
+	if ((rc = mdb_cursor_open(txn->mdb_txn, text_db, &cursor))) {
+		fprintf(stderr, "ndb_text_search: mdb_cursor_open failed, error %d\n", rc);
+		return 0;
+	}
+
+	for (i = 0; i < words.num_words; i++) {
+		word = &words.words[i];
+		fprintf(stderr, "search word %.*s\n", word->word_len, word->word);
+
+		if (!ndb_make_text_search_key_low(buffer, sizeof(buffer),
+						  word->word_len, word->word,
+						  &keysize)) {
+			// word is too big to fit in 1024-sized key
+			continue;
+		}
+
+		k.mv_data = buffer;
+		k.mv_size = keysize;
+
+		// Position cursor at the next key greater than or equal to the specified key
+		if (mdb_cursor_get(cursor, &k, &v, MDB_SET_RANGE))
+			continue;
+
+		//note_ids[num_note_ids++] = *((uint64_t*)v.mv_data);
+		note_id = *((uint64_t*)v.mv_data);
+		if ((note = ndb_get_note_by_key(txn, note_id, &len))) {
+			fprintf(stderr, "found note: '%s' for query word '%.*s'\n",
+				ndb_note_str(note, &note->content).str,
+				word->word_len, word->word);
+		}
+
+		// first hit ends the search
+		break;
+	}
+
+	// the cursor was opened above, so release it on every exit path
+	mdb_cursor_close(cursor);
+
+	return 1;
+}
+
 static uint64_t ndb_write_note(struct ndb_txn *txn,
 			       struct ndb_writer_note *note)
 {
@ -1910,6 +2427,12 @@ static uint64_t ndb_write_note(struct ndb_txn *txn,
 	if (!ndb_write_note_kind_index(txn, note->note, note_key))
 		return 0;

+	// only do fulltext index on kind1 notes
+	if (note->note->kind == 1) {
+		if (!ndb_write_note_fulltext_index(txn, note->note, note_key))
+			return 0;
+	}
+
 	if (note->note->kind == 7) {
 		ndb_write_reaction_stats(txn, note->note);
 	}
@ -2282,6 +2805,12 @@ static int ndb_init_lmdb(const char *filename, struct ndb_lmdb *lmdb, size_t map
 	}
 	mdb_set_compare(txn, lmdb->dbs[NDB_DB_NOTE_KIND], ndb_u64_tsid_compare);

+	if ((rc = mdb_dbi_open(txn, "note_text", tsid_flags, &lmdb->dbs[NDB_DB_NOTE_TEXT]))) {
+		fprintf(stderr, "mdb_dbi_open id failed: %s\n", mdb_strerror(rc));
+		return 0;
+	}
+	mdb_set_compare(txn, lmdb->dbs[NDB_DB_NOTE_TEXT], ndb_text_search_key_compare);
+
 	// Commit the transaction
 	if ((rc = mdb_txn_commit(txn))) {
 		fprintf(stderr, "mdb_txn_commit failed, error %d\n", rc);
--- a/nostrdb/nostrdb.h
+++ b/nostrdb/nostrdb.h
@ -42,6 +42,7 @@ enum ndb_dbs {
 	NDB_DB_PROFILE_SEARCH,
 	NDB_DB_PROFILE_LAST_FETCH,
 	NDB_DB_NOTE_KIND, // note kind index
+	NDB_DB_NOTE_TEXT, // note fulltext index
 	NDB_DBS,
 };

@ -327,6 +328,10 @@ void ndb_filter_reset(struct ndb_filter *);
 void ndb_filter_end_field(struct ndb_filter *);
 void ndb_filter_free(struct ndb_filter *filter);

+
+// FULLTEXT SEARCH
+//
+// Splits `query` into words and looks them up in the note fulltext index
+// using the given transaction. The first match found is printed to stderr;
+// no results are returned to the caller yet.
+// Returns 0 if the index cursor cannot be opened, 1 otherwise.
+int ndb_text_search(struct ndb_txn *, const char *query);
+
 // stats
 int ndb_stat(struct ndb *ndb, struct ndb_stat *stat);
 void ndb_stat_counts_init(struct ndb_stat_counts *counts);
@ -528,6 +533,8 @@ ndb_db_name(enum ndb_dbs db)
 			return "profile_last_fetch";
 		case NDB_DB_NOTE_KIND:
 			return "note_kind_index";
+		case NDB_DB_NOTE_TEXT:
+			return "note_fulltext";
 		case NDB_DBS:
 			return "count";
 	}