From d6df5c65b1330891d61427303b2267ca7d83cc1b Mon Sep 17 00:00:00 2001
From: Doug Hoyte <doug@hoytech.com>
Date: Wed, 8 Feb 2023 05:46:40 -0500
Subject: [PATCH] Revert "allow filtering for indexed values > 255 bytes"

This reverts commit 93ca4b904433d6b778500a424e5be4788864f71d.
---
 README.md     |  2 +-
 src/filters.h | 11 ++++-------
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 2dc117f..17b1b8b 100644
--- a/README.md
+++ b/README.md
@@ -195,7 +195,7 @@ A `FilterGroup` is a vector of `Filter` objects. When the Ingester receives a `R
 
 In order to determine if an event matches against a `Filter`, first the `since` and `until` fields are checked. Then, each field of the event for which a filter item was specified is looked up in the corresponding lookup table. Specifically, the upper-bound index is determined using a binary search (for example `std::upper_bound`). This is the first element greater than the event's item. Then the preceeding table item is checked for either a prefix (`ids`/`authors`) or exact (everything else) match.
 
-Since testing `Filter`s against events is performed so frequently, it is a performance-critical operation and some optimisations have been applied. For example, each filter item in the lookup table is represented by a 8 byte data structure, one of which is the first byte of the field and the rest are offset/size lookups into a single memory allocation containing the remaining bytes. Under typical scenarios, this will greatly reduce the amount of memory that needs to be loaded to process a filter. Filters with 8 or fewer items can often be rejected with the load of a single cache line. Because filters aren't scanned linearly, the number of items in a filter (ie amount of pubkeys) doesn't have a significant impact on processing resources.
+Since testing `Filter`s against events is performed so frequently, it is a performance-critical operation and some optimisations have been applied. For example, each filter item in the lookup table is represented by a 4 byte data structure, one of which is the first byte of the field and the rest are offset/size lookups into a single memory allocation containing the remaining bytes. Under typical scenarios, this will greatly reduce the amount of memory that needs to be loaded to process a filter. Filters with 16 or fewer items can often be rejected with the load of a single cache line. Because filters aren't scanned linearly, the number of items in a filter (ie amount of pubkeys) doesn't have a significant impact on processing resources.
 
 #### DBScan
 
diff --git a/src/filters.h b/src/filters.h
index 25040f9..c81c954 100644
--- a/src/filters.h
+++ b/src/filters.h
@@ -7,10 +7,9 @@
 
 struct FilterSetBytes {
     struct Item {
-        uint32_t offset;
-        uint16_t size;
+        uint16_t offset;
+        uint8_t size;
         uint8_t firstByte;
-        uint8_t padding;
     };
 
     std::vector<Item> items;
@@ -19,8 +18,6 @@ struct FilterSetBytes {
 
     // Sizes are post-hex decode
     FilterSetBytes(const tao::json::value &arrHex, bool hexDecode, size_t minSize, size_t maxSize) {
-        if (maxSize > std::numeric_limits<uint16_t>::max()) throw herr("filter maxSize too big");
-
         std::vector<std::string> arr;
 
         for (const auto &i : arrHex.get_array()) {
@@ -34,11 +31,11 @@ struct FilterSetBytes {
 
         for (const auto &item : arr) {
             if (items.size() > 0 && item.starts_with(at(items.size() - 1))) continue; // remove duplicates and redundant prefixes
-            items.emplace_back(Item{ (uint32_t)buf.size(), (uint16_t)item.size(), (uint8_t)item[0] });
+            items.emplace_back(Item{ (uint16_t)buf.size(), (uint8_t)item.size(), (uint8_t)item[0] });
             buf += item;
         }
 
-        if (buf.size() > 1'000'000) throw herr("total filter items too large");
+        if (buf.size() > 65535) throw herr("total filter items too large");
     }
 
     std::string at(size_t n) const {