Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions src/core/search/base.h
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
#pragma once

#include <algorithm>
#include <iostream>
#include <memory>
#include <ostream>
#include <regex>
#include <variant>
#include <any>
#include <string>
#include <vector>

#include "core/core_types.h"
Expand Down
30 changes: 15 additions & 15 deletions src/core/search/search.cc
Original file line number Diff line number Diff line change
Expand Up @@ -264,34 +264,34 @@ struct BasicSearch {
} // namespace

FieldIndices::FieldIndices(Schema schema) : schema_{move(schema)}, all_ids_{}, indices_{} {
for (auto& [field, type] : schema_.fields) {
switch (type) {
case Schema::TAG:
indices_[field] = make_unique<TagIndex>();
for (const auto& [field_name, field_info] : schema_.fields) {
switch (field_info.type) {
case SchemaField::TAG:
indices_[field_name] = make_unique<TagIndex>();
break;
case Schema::TEXT:
indices_[field] = make_unique<TextIndex>();
case SchemaField::TEXT:
indices_[field_name] = make_unique<TextIndex>();
break;
case Schema::NUMERIC:
indices_[field] = make_unique<NumericIndex>();
case SchemaField::NUMERIC:
indices_[field_name] = make_unique<NumericIndex>();
break;
case Schema::VECTOR:
indices_[field] = make_unique<VectorIndex>();
case SchemaField::VECTOR:
indices_[field_name] = make_unique<VectorIndex>();
break;
}
}
}

void FieldIndices::Add(DocId doc, DocumentAccessor* access) {
for (auto& [field, index] : indices_) {
index->Add(doc, access, field);
index->Add(doc, access, schema_.fields[field].identifier);
}
all_ids_.insert(upper_bound(all_ids_.begin(), all_ids_.end(), doc), doc);
}

void FieldIndices::Remove(DocId doc, DocumentAccessor* access) {
for (auto& [field, index] : indices_) {
index->Remove(doc, access, field);
index->Remove(doc, access, schema_.fields[field].identifier);
}
auto it = lower_bound(all_ids_.begin(), all_ids_.end(), doc);
CHECK(it != all_ids_.end() && *it == doc);
Expand All @@ -305,10 +305,10 @@ BaseIndex* FieldIndices::GetIndex(string_view field) const {

std::vector<TextIndex*> FieldIndices::GetAllTextIndices() const {
vector<TextIndex*> out;
for (auto& [field, type] : schema_.fields) {
if (type != Schema::TEXT)
for (auto& [field_name, field_info] : schema_.fields) {
if (field_info.type != SchemaField::TEXT)
continue;
auto* index = dynamic_cast<TextIndex*>(GetIndex(field));
auto* index = dynamic_cast<TextIndex*>(GetIndex(field_name));
DCHECK(index);
out.push_back(index);
}
Expand Down
9 changes: 7 additions & 2 deletions src/core/search/search.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,15 @@ namespace dfly::search {
struct AstNode;
struct TextIndex;

struct Schema {
struct SchemaField {
enum FieldType { TAG, TEXT, NUMERIC, VECTOR };

absl::flat_hash_map<std::string, FieldType> fields;
std::string identifier;
FieldType type;
};

struct Schema {
absl::flat_hash_map<std::string, SchemaField> fields;
};

// Collection of indices for all fields in schema
Expand Down
26 changes: 17 additions & 9 deletions src/core/search/search_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,18 @@ struct MockedDocument : public DocumentAccessor {
Map fields_{};
};

Schema MakeSimpleSchema(initializer_list<pair<string_view, SchemaField::FieldType>> ilist) {
Schema schema;
for (auto [name, type] : ilist) {
schema.fields[name] = {string{name}, type};
}
return schema;
}

class SearchParserTest : public ::testing::Test {
protected:
SearchParserTest() {
PrepareSchema();
PrepareSchema({{"field", SchemaField::TEXT}});
}

~SearchParserTest() {
Expand All @@ -82,8 +90,8 @@ class SearchParserTest : public ::testing::Test {
params_.knn_vec = vec;
}

void PrepareSchema(Schema schema = {{{"field", Schema::TEXT}}}) {
schema_ = schema;
void PrepareSchema(initializer_list<pair<string_view, SchemaField::FieldType>> ilist) {
schema_ = MakeSimpleSchema(ilist);
}

void PrepareQuery(string_view query) {
Expand Down Expand Up @@ -248,7 +256,7 @@ TEST_F(SearchParserTest, CheckParenthesisPriority) {
using Map = MockedDocument::Map;

TEST_F(SearchParserTest, MatchField) {
PrepareSchema({{{"f1", Schema::TEXT}, {"f2", Schema::TEXT}, {"f3", Schema::TEXT}}});
PrepareSchema({{"f1", SchemaField::TEXT}, {"f2", SchemaField::TEXT}, {"f3", SchemaField::TEXT}});
PrepareQuery("@f1:foo @f2:bar @f3:baz");

ExpectAll(Map{{"f1", "foo"}, {"f2", "bar"}, {"f3", "baz"}});
Expand All @@ -260,7 +268,7 @@ TEST_F(SearchParserTest, MatchField) {
}

TEST_F(SearchParserTest, MatchRange) {
PrepareSchema({{{"f1", Schema::NUMERIC}, {"f2", Schema::NUMERIC}}});
PrepareSchema({{"f1", SchemaField::NUMERIC}, {"f2", SchemaField::NUMERIC}});
PrepareQuery("@f1:[1 10] @f2:[50 100]");

ExpectAll(Map{{"f1", "5"}, {"f2", "50"}}, Map{{"f1", "1"}, {"f2", "100"}},
Expand All @@ -277,7 +285,7 @@ TEST_F(SearchParserTest, MatchStar) {
}

TEST_F(SearchParserTest, CheckExprInField) {
PrepareSchema({{{"f1", Schema::TEXT}, {"f2", Schema::TEXT}, {"f3", Schema::TEXT}}});
PrepareSchema({{"f1", SchemaField::TEXT}, {"f2", SchemaField::TEXT}, {"f3", SchemaField::TEXT}});
{
PrepareQuery("@f1:(a|b) @f2:(c d) @f3:-e");

Expand All @@ -300,7 +308,7 @@ TEST_F(SearchParserTest, CheckExprInField) {
}

TEST_F(SearchParserTest, CheckTag) {
PrepareSchema({{{"f1", Schema::TAG}, {"f2", Schema::TAG}}});
PrepareSchema({{"f1", SchemaField::TAG}, {"f2", SchemaField::TAG}});

PrepareQuery("@f1:{red | blue} @f2:{circle | square}");

Expand All @@ -316,7 +324,7 @@ TEST_F(SearchParserTest, CheckTag) {
}

TEST_F(SearchParserTest, SimpleKnn) {
Schema schema{{{"even", Schema::TAG}, {"pos", Schema::VECTOR}}};
auto schema = MakeSimpleSchema({{"even", SchemaField::TAG}, {"pos", SchemaField::VECTOR}});
FieldIndices indices{schema};

// Place points on a straight line
Expand Down Expand Up @@ -366,7 +374,7 @@ TEST_F(SearchParserTest, Simple2dKnn) {
// 0 1
const pair<float, float> kTestCoords[] = {{0, 0}, {1, 0}, {1, 1}, {0, 1}, {0.5, 0.5}};

Schema schema{{{"pos", Schema::VECTOR}}};
auto schema = MakeSimpleSchema({{"pos", SchemaField::VECTOR}});
FieldIndices indices{schema};

for (size_t i = 0; i < ABSL_ARRAYSIZE(kTestCoords); i++) {
Expand Down
41 changes: 37 additions & 4 deletions src/server/search/doc_accessors.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <absl/strings/str_join.h>

#include <jsoncons/json.hpp>
#include <jsoncons_ext/jsonpath/jsonpath.hpp>

#include "core/json_object.h"
#include "core/search/search.h"
Expand Down Expand Up @@ -55,7 +56,7 @@ SearchDocData ListPackAccessor::Serialize(search::Schema schema) const {
string_view v = container_utils::LpGetView(fptr, intbuf_[1].data());
fptr = lpNext(lp_, fptr);

if (schema.fields.at(k) == search::Schema::VECTOR)
if (schema.fields.at(k).type == search::SchemaField::VECTOR)
out[k] = FtVectorToString(GetVector(k));
else
out[k] = v;
Expand All @@ -78,7 +79,7 @@ SearchDocData StringMapAccessor::Serialize(search::Schema schema) const {
string_view k = SdsToSafeSv(kptr);
string_view v = SdsToSafeSv(vptr);

if (schema.fields.at(k) == search::Schema::VECTOR)
if (schema.fields.at(k).type == search::SchemaField::VECTOR)
out[k] = FtVectorToString(GetVector(k));
else
out[k] = v;
Expand All @@ -87,18 +88,47 @@ SearchDocData StringMapAccessor::Serialize(search::Schema schema) const {
return out;
}

struct JsonAccessor::JsonPathContainer : public jsoncons::jsonpath::jsonpath_expression<JsonType> {
};

string_view JsonAccessor::GetString(string_view active_field) const {
buf_ = json_->get_value_or<string>(active_field, string{});
auto res = GetPath(active_field)->evaluate(*json_);
DCHECK(res.is_array());
if (res.empty())
return "";
buf_ = res[0].as_string();
return buf_;
}

search::FtVector JsonAccessor::GetVector(string_view active_field) const {
auto res = GetPath(active_field)->evaluate(*json_);
DCHECK(res.is_array());
if (res.empty())
return {};

search::FtVector out;
for (auto v : json_->at(active_field).array_range())
for (auto v : res[0].array_range())
out.push_back(v.as<float>());
return out;
}

JsonAccessor::JsonPathContainer* JsonAccessor::GetPath(std::string_view field) const {
if (auto it = path_cache_.find(field); it != path_cache_.end()) {
return it->second.get();
} else {
error_code ec;
auto path_expr = jsoncons::jsonpath::make_expression<JsonType>(field, ec);
DCHECK(!ec) << "missing validation on ft.create step";

JsonPathContainer path_container{move(path_expr)};
auto ptr = make_unique<JsonPathContainer>(move(path_container));

JsonPathContainer* path = ptr.get();
path_cache_[field] = move(ptr);
return path;
}
}

SearchDocData JsonAccessor::Serialize(search::Schema schema) const {
SearchDocData out{};
for (const auto& member : json_->object_range()) {
Expand All @@ -107,6 +137,9 @@ SearchDocData JsonAccessor::Serialize(search::Schema schema) const {
return out;
}

thread_local absl::flat_hash_map<std::string, std::unique_ptr<JsonAccessor::JsonPathContainer>>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need unique_ptr for the value_type here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't want to pull in json dependencies into the header, so I forward declared JsonPathContainer. Actually pulling in json deps is not crucial as this header is used only in search internal files 🤔

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having global caches with no eviction and no limit seems dangerous, memory wise.
Could we turn this into an LRU? Or bind this caching to the lifetime of the query?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Those are field names defined by developers. Each schema has usually a few fields, and there are usually no more than a few indices per database. Having them occupy kilobytes of memory would require developers to type out kilobytes of field names which is really unlikely

We can clear this cache once indices are deleted

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, they can't query for arbitrary fields outside of established indices? Somehow I thought it's possible but just slower.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, its only used for accessing an index - and that index must have been explicitly described by the user

JsonAccessor::path_cache_;

Comment on lines +144 to +146
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've chosen to use this tl cache instead of polluting the core json parts with json stuff. Alternatively we can choose to either wire json to the core or use just a generic member like std::any access_helper_ to get by without a type

unique_ptr<BaseAccessor> GetAccessor(const DbContext& db_cntx, const PrimeValue& pv) {
DCHECK(pv.ObjType() == OBJ_HASH || pv.ObjType() == OBJ_JSON);

Expand Down
10 changes: 9 additions & 1 deletion src/server/search/doc_accessors.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ struct StringMapAccessor : public BaseAccessor {

// Accessor for json values
struct JsonAccessor : public BaseAccessor {
struct JsonPathContainer; // contains jsoncons::jsonpath::jsonpath_expression

explicit JsonAccessor(JsonType* json) : json_{json} {
}

Expand All @@ -63,8 +65,14 @@ struct JsonAccessor : public BaseAccessor {
SearchDocData Serialize(search::Schema schema) const override;

private:
mutable std::string buf_;
JsonPathContainer* GetPath(std::string_view field) const;

JsonType* json_;
mutable std::string buf_;

// Contains built json paths to avoid parsing them repeatedly
static thread_local absl::flat_hash_map<std::string, std::unique_ptr<JsonPathContainer>>
path_cache_;
};

// Get accessor for value
Expand Down
Loading