Skip to content

Commit cf737ce

Browse files
author
Jash Shah
committed
Add GIN index support for text array metadata in PostgreSQL vector store
Fixes #20128

Enable automatic GIN index creation for text array metadata fields in the PostgreSQL vector store. This allows fast array membership queries using PostgreSQL operators (?|, ?&, @>) without requiring raw SQL workarounds.

Changes:
- Extended PGType to support the text[] array type
- Modified get_data_model() to create GIN indices for array fields and BTREE indices for scalar fields
- Cast metadata to JSONB in GIN index creation for compatibility with both JSON and JSONB columns
- Added comprehensive tests covering GIN index creation, the CONTAINS, ANY, and ALL operators, and mixed BTREE/GIN queries
- Tests use PostgreSQL system catalogs (pg_class, pg_am) for type-safe index verification

This maintains backward compatibility: existing BTREE indices continue working, and GIN indices are only created when text[] is specified in indexed_metadata_keys.
1 parent e403da9 commit cf737ce

File tree

3 files changed

+574
-5
lines changed

3 files changed

+574
-5
lines changed

llama-index-integrations/vector_stores/llama-index-vector-stores-postgres/llama_index/vector_stores/postgres/base.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
"date",
5050
"timestamp",
5151
"uuid",
52+
# Array type for GIN indexing
53+
"text[]",
5254
]
5355

5456

@@ -81,6 +83,7 @@ def get_data_model(
8183
from pgvector.sqlalchemy import Vector
8284
from sqlalchemy import Column, Computed
8385
from sqlalchemy.dialects.postgresql import (
86+
ARRAY,
8487
BIGINT,
8588
JSON,
8689
JSONB,
@@ -105,6 +108,8 @@ def get_data_model(
105108
"date": Date,
106109
"timestamp": DateTime,
107110
"uuid": UUID,
111+
# Array type for GIN indexing
112+
"text[]": ARRAY(String),
108113
}
109114

110115
indexed_metadata_keys = indexed_metadata_keys or set()
@@ -131,15 +136,33 @@ class TSVector(TypeDecorator):
131136
else:
132137
embedding_col = Column(Vector(embed_dim)) # type: ignore
133138

134-
metadata_indices = [
139+
# BTREE indices for scalar types (existing behavior)
140+
btree_indices = [
135141
Index(
136142
f"{indexname}_{key}_{pg_type.replace(' ', '_')}",
137143
cast(column("metadata_").op("->>")(key), pg_type_map[pg_type]),
138144
postgresql_using="btree",
139145
)
140146
for key, pg_type in indexed_metadata_keys
147+
if pg_type != "text[]"
141148
]
142149

150+
# GIN indices for text arrays (enables fast array operations with ?|, ?&, @> operators)
151+
gin_indices = [
152+
Index(
153+
f"{indexname}_{key}_gin",
154+
cast(
155+
column("metadata_").op("->")(key), JSONB
156+
), # Cast to JSONB for GIN index compatibility
157+
postgresql_using="gin",
158+
)
159+
for key, pg_type in indexed_metadata_keys
160+
if pg_type == "text[]"
161+
]
162+
163+
# Combine both types of indices
164+
metadata_indices = btree_indices + gin_indices
165+
143166
if hybrid_search:
144167

145168
class HybridAbstractData(base): # type: ignore
@@ -677,9 +700,9 @@ def _build_filter_clause(self, filter_: MetadataFilter) -> Any:
677700
f"({filter_value})"
678701
)
679702
elif filter_.operator in [FilterOperator.ANY, FilterOperator.ALL]:
680-
# Expects a list stored in the metadata, and a single value to compare
681-
682-
# We apply same logic as above, but as an array
703+
# Expects a text array stored in the metadata, and a list of values to compare
704+
# Works with text[] arrays using PostgreSQL ?| (ANY) and ?& (ALL) operators
705+
# Example: metadata_::jsonb->'tags' ?| array['AI', 'ML']
683706
filter_value = ", ".join(f"'{e}'" for e in filter_.value)
684707

685708
return text(

llama-index-integrations/vector_stores/llama-index-vector-stores-postgres/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ dev = [
2727

2828
[project]
2929
name = "llama-index-vector-stores-postgres"
30-
version = "0.7.0"
30+
version = "0.7.1"
3131
description = "llama-index vector_stores postgres integration"
3232
authors = [{name = "Your Name", email = "you@example.com"}]
3333
requires-python = ">=3.9,<4.0"

0 commit comments

Comments
 (0)