Skip to content

Commit 845167a

Browse files
committed
perf: smart catalog attributes fetching
Improves fetching of catalog attributes. The previous implementation used a threshold to prevent `414 Request-URI Too Large`; if the threshold was reached, the whole catalog was fetched. The new implementation calls get_attributes_catalog multiple times, each time with a different rsql_filter. JIRA: STL-1036 risk: low
1 parent 51f0cf4 commit 845167a

File tree

6 files changed

+335
-13
lines changed

6 files changed

+335
-13
lines changed

gooddata-pandas/gooddata_pandas/data_access.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
ObjId,
1717
TableDimension,
1818
)
19-
from gooddata_sdk.utils import IdObjType, filter_for_attributes_labels
19+
from gooddata_sdk.utils import IdObjType
2020

2121
from gooddata_pandas.utils import (
2222
ColumnsDef,
@@ -26,6 +26,7 @@
2626
_to_attribute,
2727
_to_item,
2828
_typed_attribute_value,
29+
get_catalog_attributes_for_extract,
2930
)
3031

3132

@@ -446,12 +447,7 @@ def compute_and_extract(
446447
if not exec_def.has_attributes():
447448
return _extract_for_metrics_only(response, cols, col_to_metric_idx), dict()
448449
else:
449-
filter_query = filter_for_attributes_labels(exec_def.attributes)
450-
# if there is to many labels then all attributes are fetched and no rsql filter is used
451-
# it prevention again 414 Request-URI Too Long
452-
attributes = sdk.catalog_workspace_content.get_attributes_catalog(
453-
workspace_id, include=["labels", "datasets"], rsql_filter=filter_query
454-
)
450+
attributes = get_catalog_attributes_for_extract(sdk, workspace_id, exec_def.attributes)
455451
return _extract_from_attributes_and_maybe_metrics(
456452
response,
457453
attributes,

gooddata-pandas/gooddata_pandas/utils.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@
99
from gooddata_sdk import (
1010
Attribute,
1111
CatalogAttribute,
12+
GoodDataSdk,
1213
Metric,
1314
ObjId,
1415
SimpleMetric,
1516
VisualizationAttribute,
1617
VisualizationMetric,
1718
)
1819
from gooddata_sdk.type_converter import AttributeConverterStore, DateConverter, DatetimeConverter, IntegerConverter
20+
from gooddata_sdk.utils import filter_for_attributes_labels
1921
from pandas import Index, MultiIndex
2022

2123
LabelItemDef = Union[Attribute, ObjId, str]
@@ -29,6 +31,25 @@
2931
DatetimeConverter.set_external_fnc(lambda self, value: pandas.to_datetime(value))
3032

3133

34+
def get_catalog_attributes_for_extract(
    sdk: GoodDataSdk, workspace_id: str, attributes: list[Attribute], character_limit: int = 1500
) -> list[CatalogAttribute]:
    """
    Fetch the catalog attributes that correspond to the given execution attributes.

    ``filter_for_attributes_labels`` turns the attributes into a sequence of RSQL
    queries (``character_limit`` is forwarded to it unchanged), and
    ``get_attributes_catalog`` is called once per query with labels and datasets
    included. This avoids loading the whole attribute catalog of the workspace.

    Because several requests are made, the same catalog attribute may come back
    more than once (e.g. when its labels are spread over different queries).
    Repeats are dropped, so every catalog attribute id appears at most once in
    the result, in the order it was first returned.

    Args:
        sdk: SDK instance whose ``catalog_workspace_content`` service is queried.
        workspace_id: Identifier of the workspace to read the catalog from.
        attributes: Execution attributes whose catalog attributes are needed.
        character_limit: Passed to ``filter_for_attributes_labels``; presumably the
            maximum length of a single RSQL filter - confirm against that helper.

    Returns:
        Catalog attributes collected from all requests, unique by ``id``.
    """
    rsql_queries = filter_for_attributes_labels(attributes, character_limit)
    # Keyed by catalog attribute id; dict insertion order keeps first-seen ordering.
    unique_by_id: dict[str, CatalogAttribute] = {}
    for query in rsql_queries:
        fetched = sdk.catalog_workspace_content.get_attributes_catalog(
            workspace_id, include=["labels", "datasets"], rsql_filter=query
        )
        for attr in fetched:
            # setdefault keeps the first occurrence and ignores later repeats.
            unique_by_id.setdefault(attr.id, attr)
    return list(unique_by_id.values())
51+
52+
3253
def _unique_local_id() -> str:
3354
"""
3455
Generate unique local ID of a DataItem without dashes.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# (C) 2025 GoodData Corporation
Lines changed: 266 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,266 @@
1+
# (C) 2025 GoodData Corporation
2+
version: 1
3+
interactions:
4+
- request:
5+
method: GET
6+
uri: http://localhost:3000/api/v1/entities/workspaces/demo/attributes?include=labels%2Cdatasets&filter=labels.id%3Din%3D%28campaign_name%29&page=0&size=500
7+
body: null
8+
headers:
9+
Accept:
10+
- application/vnd.gooddata.api+json
11+
Accept-Encoding:
12+
- br, gzip, deflate
13+
X-GDC-VALIDATE-RELATIONS:
14+
- 'true'
15+
X-Requested-With:
16+
- XMLHttpRequest
17+
response:
18+
status:
19+
code: 200
20+
message: OK
21+
headers:
22+
Access-Control-Allow-Credentials:
23+
- 'true'
24+
Access-Control-Expose-Headers:
25+
- Content-Disposition, Content-Length, Content-Range, Set-Cookie
26+
Cache-Control:
27+
- no-cache, no-store, max-age=0, must-revalidate
28+
Connection:
29+
- keep-alive
30+
Content-Length:
31+
- '1541'
32+
Content-Security-Policy:
33+
- 'default-src ''self'' *.wistia.com *.wistia.net; script-src ''self'' ''unsafe-inline''
34+
''unsafe-eval'' *.wistia.com *.wistia.net *.hsforms.net *.hsforms.com
35+
src.litix.io matomo.anywhere.gooddata.com *.jquery.com unpkg.com cdnjs.cloudflare.com;
36+
img-src * data: blob:; style-src ''self'' ''unsafe-inline'' fonts.googleapis.com
37+
cdn.jsdelivr.net fast.fonts.net; font-src ''self'' data: fonts.gstatic.com
38+
*.alicdn.com *.wistia.com cdn.jsdelivr.net info.gooddata.com; frame-src
39+
''self'' *.hsforms.net *.hsforms.com; object-src ''none''; worker-src
40+
''self'' blob:; child-src blob:; connect-src ''self'' *.tiles.mapbox.com
41+
*.mapbox.com *.litix.io *.wistia.com *.hsforms.net *.hsforms.com embedwistia-a.akamaihd.net
42+
matomo.anywhere.gooddata.com; media-src ''self'' blob: data: *.wistia.com
43+
*.wistia.net embedwistia-a.akamaihd.net'
44+
Content-Type:
45+
- application/vnd.gooddata.api+json
46+
DATE: &id001
47+
- PLACEHOLDER
48+
Expires:
49+
- '0'
50+
GoodData-Deployment:
51+
- aio
52+
Permission-Policy:
53+
- geolocation 'none'; midi 'none'; sync-xhr 'none'; microphone 'none'; camera
54+
'none'; magnetometer 'none'; gyroscope 'none'; fullscreen 'none'; payment
55+
'none';
56+
Pragma:
57+
- no-cache
58+
Referrer-Policy:
59+
- no-referrer
60+
Server:
61+
- nginx
62+
Vary:
63+
- Origin
64+
- Access-Control-Request-Method
65+
- Access-Control-Request-Headers
66+
X-Content-Type-Options:
67+
- nosniff
68+
X-GDC-TRACE-ID: *id001
69+
X-XSS-Protection:
70+
- '0'
71+
set-cookie:
72+
- SPRING_REDIRECT_URI=; Max-Age=0; Expires=Mon, 20 Jan 2025 10:18:23 GMT;
73+
Path=/; HTTPOnly; SameSite=Lax
74+
body:
75+
string:
76+
data:
77+
- id: campaign_name
78+
type: attribute
79+
attributes:
80+
title: Campaign name
81+
description: Campaign name
82+
tags:
83+
- Campaigns
84+
areRelationsValid: true
85+
sourceColumn: campaign_name
86+
sourceColumnDataType: STRING
87+
relationships:
88+
dataset:
89+
data:
90+
id: campaigns
91+
type: dataset
92+
labels:
93+
data:
94+
- id: campaign_name
95+
type: label
96+
links:
97+
self: http://localhost:3000/api/v1/entities/workspaces/demo/attributes/campaign_name
98+
meta:
99+
origin:
100+
originType: NATIVE
101+
originId: demo
102+
included:
103+
- id: campaigns
104+
type: dataset
105+
attributes:
106+
title: Campaigns
107+
description: Campaigns
108+
tags:
109+
- Campaigns
110+
grain:
111+
- id: campaign_id
112+
type: attribute
113+
dataSourceTableId: demo-test-ds:campaigns
114+
dataSourceTablePath:
115+
- demo
116+
- campaigns
117+
type: NORMAL
118+
links:
119+
self: http://localhost:3000/api/v1/entities/workspaces/demo/datasets/campaigns
120+
- id: campaign_name
121+
type: label
122+
attributes:
123+
title: Campaign name
124+
description: Campaign name
125+
tags:
126+
- Campaigns
127+
primary: true
128+
sourceColumn: campaign_name
129+
sourceColumnDataType: STRING
130+
valueType: TEXT
131+
links:
132+
self: http://localhost:3000/api/v1/entities/workspaces/demo/labels/campaign_name
133+
links:
134+
self: http://localhost:3000/api/v1/entities/workspaces/demo/attributes?include=labels%2Cdatasets&filter=labels.id%3D%3D%27campaign_name%27&page=0&size=500
135+
next: http://localhost:3000/api/v1/entities/workspaces/demo/attributes?include=labels%2Cdatasets&filter=labels.id%3D%3D%27campaign_name%27&page=1&size=500
136+
- request:
137+
method: GET
138+
uri: http://localhost:3000/api/v1/entities/workspaces/demo/attributes?include=labels%2Cdatasets&filter=labels.id%3Din%3D%28region%29&page=0&size=500
139+
body: null
140+
headers:
141+
Accept:
142+
- application/vnd.gooddata.api+json
143+
Accept-Encoding:
144+
- br, gzip, deflate
145+
X-GDC-VALIDATE-RELATIONS:
146+
- 'true'
147+
X-Requested-With:
148+
- XMLHttpRequest
149+
response:
150+
status:
151+
code: 200
152+
message: OK
153+
headers:
154+
Access-Control-Allow-Credentials:
155+
- 'true'
156+
Access-Control-Expose-Headers:
157+
- Content-Disposition, Content-Length, Content-Range, Set-Cookie
158+
Cache-Control:
159+
- no-cache, no-store, max-age=0, must-revalidate
160+
Connection:
161+
- keep-alive
162+
Content-Length:
163+
- '1450'
164+
Content-Security-Policy:
165+
- 'default-src ''self'' *.wistia.com *.wistia.net; script-src ''self'' ''unsafe-inline''
166+
''unsafe-eval'' *.wistia.com *.wistia.net *.hsforms.net *.hsforms.com
167+
src.litix.io matomo.anywhere.gooddata.com *.jquery.com unpkg.com cdnjs.cloudflare.com;
168+
img-src * data: blob:; style-src ''self'' ''unsafe-inline'' fonts.googleapis.com
169+
cdn.jsdelivr.net fast.fonts.net; font-src ''self'' data: fonts.gstatic.com
170+
*.alicdn.com *.wistia.com cdn.jsdelivr.net info.gooddata.com; frame-src
171+
''self'' *.hsforms.net *.hsforms.com; object-src ''none''; worker-src
172+
''self'' blob:; child-src blob:; connect-src ''self'' *.tiles.mapbox.com
173+
*.mapbox.com *.litix.io *.wistia.com *.hsforms.net *.hsforms.com embedwistia-a.akamaihd.net
174+
matomo.anywhere.gooddata.com; media-src ''self'' blob: data: *.wistia.com
175+
*.wistia.net embedwistia-a.akamaihd.net'
176+
Content-Type:
177+
- application/vnd.gooddata.api+json
178+
DATE: *id001
179+
Expires:
180+
- '0'
181+
GoodData-Deployment:
182+
- aio
183+
Permission-Policy:
184+
- geolocation 'none'; midi 'none'; sync-xhr 'none'; microphone 'none'; camera
185+
'none'; magnetometer 'none'; gyroscope 'none'; fullscreen 'none'; payment
186+
'none';
187+
Pragma:
188+
- no-cache
189+
Referrer-Policy:
190+
- no-referrer
191+
Server:
192+
- nginx
193+
Vary:
194+
- Origin
195+
- Access-Control-Request-Method
196+
- Access-Control-Request-Headers
197+
X-Content-Type-Options:
198+
- nosniff
199+
X-GDC-TRACE-ID: *id001
200+
X-XSS-Protection:
201+
- '0'
202+
set-cookie:
203+
- SPRING_REDIRECT_URI=; Max-Age=0; Expires=Mon, 20 Jan 2025 10:18:23 GMT;
204+
Path=/; HTTPOnly; SameSite=Lax
205+
body:
206+
string:
207+
data:
208+
- id: region
209+
type: attribute
210+
attributes:
211+
title: Region
212+
description: Region
213+
tags:
214+
- Customers
215+
areRelationsValid: true
216+
sourceColumn: region
217+
sourceColumnDataType: STRING
218+
relationships:
219+
dataset:
220+
data:
221+
id: customers
222+
type: dataset
223+
labels:
224+
data:
225+
- id: region
226+
type: label
227+
links:
228+
self: http://localhost:3000/api/v1/entities/workspaces/demo/attributes/region
229+
meta:
230+
origin:
231+
originType: NATIVE
232+
originId: demo
233+
included:
234+
- id: customers
235+
type: dataset
236+
attributes:
237+
title: Customers
238+
description: Customers
239+
tags:
240+
- Customers
241+
grain:
242+
- id: customer_id
243+
type: attribute
244+
dataSourceTableId: demo-test-ds:customers
245+
dataSourceTablePath:
246+
- demo
247+
- customers
248+
type: NORMAL
249+
links:
250+
self: http://localhost:3000/api/v1/entities/workspaces/demo/datasets/customers
251+
- id: region
252+
type: label
253+
attributes:
254+
title: Region
255+
description: Region
256+
tags:
257+
- Customers
258+
primary: true
259+
sourceColumn: region
260+
sourceColumnDataType: STRING
261+
valueType: TEXT
262+
links:
263+
self: http://localhost:3000/api/v1/entities/workspaces/demo/labels/region
264+
links:
265+
self: http://localhost:3000/api/v1/entities/workspaces/demo/attributes?include=labels%2Cdatasets&filter=labels.id%3D%3D%27region%27&page=0&size=500
266+
next: http://localhost:3000/api/v1/entities/workspaces/demo/attributes?include=labels%2Cdatasets&filter=labels.id%3D%3D%27region%27&page=1&size=500
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# (C) 2025 GoodData Corporation
2+
from pathlib import Path
3+
4+
from gooddata_pandas.utils import get_catalog_attributes_for_extract
5+
from gooddata_sdk import (
6+
Attribute,
7+
GoodDataSdk,
8+
)
9+
from tests_support.vcrpy_utils import get_vcr
10+
11+
# VCR helper obtained from tests_support; its use_cassette decorator wraps the test below.
gd_vcr = get_vcr()
12+
13+
# Absolute path of the directory that contains this test module.
_current_dir = Path(__file__).parent.absolute()
14+
# "fixtures" directory next to this module; cassette file paths are built from it.
_fixtures_dir = _current_dir / "fixtures"
15+
16+
17+
@gd_vcr.use_cassette(str(_fixtures_dir / "test_get_catalog_attributes_for_extract.yaml"))
def test_get_catalog_attributes_for_extract(test_config):
    """Requesting two labels with character_limit=28 yields both catalog attributes in order."""
    expected_ids = ["campaign_name", "region"]
    # Local ids are "0", "1", ... in the same order as the requested labels.
    requested = [Attribute(local_id=str(idx), label=label_id) for idx, label_id in enumerate(expected_ids)]
    client = GoodDataSdk.create(host_=test_config["host"], token_=test_config["token"])

    fetched = get_catalog_attributes_for_extract(client, "demo", requested, character_limit=28)

    assert len(fetched) == 2
    assert [item.id for item in fetched] == expected_ids

0 commit comments

Comments
 (0)