Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
150 commits
Select commit Hold shift + click to select a range
5189190
index pkg entries
sir-sigurd Jun 4, 2025
9792c46
CI
sir-sigurd Jun 4, 2025
d6bc135
lint
sir-sigurd Jun 4, 2025
fb0822c
fix decorator
sir-sigurd Jun 4, 2025
ef11bcb
use non-deprecated name
sir-sigurd Jun 4, 2025
bd11a04
do not index dir entries & add debugging
sir-sigurd Jun 5, 2025
25d6b5a
use prefix for pkg fields, parse pk
sir-sigurd Jun 10, 2025
69666d4
use hash for ids
sir-sigurd Jun 10, 2025
9ce314a
pointers as child docs, is_packaged field
sir-sigurd Jun 16, 2025
6136100
fix pkg_stats name
sir-sigurd Jun 17, 2025
99dd238
change fields names, fixes
sir-sigurd Jun 19, 2025
ec35e53
fix
sir-sigurd Jun 19, 2025
ef2b90f
fixes
sir-sigurd Jun 19, 2025
6708669
ignore manifests with invalid hashes
sir-sigurd Jun 19, 2025
941b664
fix total_files
sir-sigurd Jun 19, 2025
28e42d9
attempt to fix parent ids
nl0 Jun 20, 2025
bc0a58a
fix check
sir-sigurd Jun 20, 2025
b6cb8ef
do not store more than CHUNK_LIMIT_DOCS
sir-sigurd Jun 20, 2025
5354113
log errors
sir-sigurd Jun 23, 2025
4060819
small adjustments
sir-sigurd Jun 24, 2025
49616db
try http_compress
sir-sigurd Jun 24, 2025
3f02276
disable http_compress, increase CHUNK_LIMIT_DOCS
sir-sigurd Jun 24, 2025
9d3de8e
fix, increase CHUNK_LIMIT_DOCS
sir-sigurd Jun 24, 2025
561aa46
increase CHUNK_LIMIT_DOCS
sir-sigurd Jun 24, 2025
375b1eb
change log level
sir-sigurd Jun 24, 2025
3a16832
rework indexing
sir-sigurd Jun 25, 2025
c065497
fix
sir-sigurd Jun 25, 2025
0fc4c33
fix
sir-sigurd Jun 25, 2025
0368b10
fix
sir-sigurd Jun 25, 2025
acee91b
fix
sir-sigurd Jun 25, 2025
a493eda
fix
sir-sigurd Jun 25, 2025
c132d68
fix
sir-sigurd Jun 25, 2025
6821dda
fix
sir-sigurd Jun 26, 2025
7b50bcf
fix
sir-sigurd Jun 26, 2025
f565614
fix
sir-sigurd Jun 26, 2025
4c93b73
fix
sir-sigurd Jun 26, 2025
267563d
Revert "fix"
sir-sigurd Jun 26, 2025
f3426b4
fix
sir-sigurd Jun 26, 2025
70781c9
fix
sir-sigurd Jun 26, 2025
9b7b34f
fix
sir-sigurd Jun 26, 2025
e8acfcf
delete objects
sir-sigurd Jun 26, 2025
43140cb
test
sir-sigurd Jun 26, 2025
237f5c8
error handling
sir-sigurd Jun 27, 2025
05f4ae0
log retries
sir-sigurd Jun 27, 2025
11a66d4
wait more
sir-sigurd Jun 27, 2025
e7f3067
adjust backoff
sir-sigurd Jun 27, 2025
ea50e64
more aggresive backoff
sir-sigurd Jun 27, 2025
f79ba13
more aggresive backoff
sir-sigurd Jun 28, 2025
8358ed6
remove unneeded
sir-sigurd Jun 28, 2025
5cb85f5
sleep after slow responses
sir-sigurd Jun 28, 2025
9fca19b
more agressive backoff
sir-sigurd Jun 28, 2025
58119ef
more aggressive backoff
sir-sigurd Jun 28, 2025
68c59e7
more aggressive backoff
sir-sigurd Jun 28, 2025
720abb8
more aggressive backoff
sir-sigurd Jun 28, 2025
e43a5c6
more aggressive backoff
sir-sigurd Jun 29, 2025
4d4cd33
more
sir-sigurd Jun 29, 2025
61f5702
more
sir-sigurd Jun 29, 2025
91e9045
tmp
sir-sigurd Jun 30, 2025
6a01419
tmp
sir-sigurd Jun 30, 2025
ed48279
tmp
sir-sigurd Jun 30, 2025
7121784
tmp
sir-sigurd Jun 30, 2025
bc9c170
tmp
sir-sigurd Jun 30, 2025
431be7b
more work
sir-sigurd Jun 30, 2025
bdb01ab
remove unused stuff
sir-sigurd Jun 30, 2025
0f9ce90
remove fcsparser test data
sir-sigurd Jun 30, 2025
2c47d9f
Revert "remove fcsparser test data"
sir-sigurd Jun 30, 2025
1d4ea7e
try to build
sir-sigurd Jul 1, 2025
5ea51a0
fix path
sir-sigurd Jul 1, 2025
6ad380a
try to fix
sir-sigurd Jul 1, 2025
899a351
fix
sir-sigurd Jul 1, 2025
a62aadc
try to fix
sir-sigurd Jul 1, 2025
969d178
add test deps
sir-sigurd Jul 1, 2025
b0bd132
correct step name
sir-sigurd Jul 1, 2025
2f2938a
do not deploy thumnail
sir-sigurd Jul 1, 2025
74aa760
Merge branch 'master' into index-pkg-entries-2
sir-sigurd Jul 1, 2025
3dc3b41
isort
sir-sigurd Jul 1, 2025
3437f4f
fix some linting
sir-sigurd Jul 1, 2025
0209922
fix logging
sir-sigurd Jul 1, 2025
b67840d
add pytest-cov
sir-sigurd Jul 1, 2025
62f7efc
Merge branch 'master' into index-pkg-entries-2
sir-sigurd Jul 1, 2025
01b449f
revert CHUNK_LIMIT_DOCS
sir-sigurd Jul 1, 2025
e4f3c42
adjust get_es_client
sir-sigurd Jul 1, 2025
a286699
use updated quilt_shared.es
sir-sigurd Jul 1, 2025
2e0491a
polish/revert
sir-sigurd Jul 1, 2025
a4abc4c
update requirements.txt
sir-sigurd Jul 1, 2025
f9abe14
try to fix
sir-sigurd Jul 1, 2025
ee8b1a2
add prefix
sir-sigurd Jul 1, 2025
5bb1066
update lock
sir-sigurd Jul 1, 2025
11a0442
fix
sir-sigurd Jul 1, 2025
3f0047a
fix dep
sir-sigurd Jul 1, 2025
88ff687
move pytest-cov to test group
sir-sigurd Jul 1, 2025
ccd193d
cleanup
sir-sigurd Jul 1, 2025
25812f2
use orjson
sir-sigurd Jul 2, 2025
5edd631
bump py-shared
sir-sigurd Jul 2, 2025
03e138d
switch to orjson
sir-sigurd Jul 2, 2025
9be1c64
cleanup
sir-sigurd Jul 2, 2025
5a1856f
update requirements.txt
sir-sigurd Jul 2, 2025
97c9a25
fix some tests
sir-sigurd Jul 2, 2025
afdcf31
cleanup
sir-sigurd Jul 2, 2025
5252337
cleanup
sir-sigurd Jul 2, 2025
2f1119a
add test
sir-sigurd Jul 2, 2025
c4a35b5
remove log from quilt-shared
sir-sigurd Jul 2, 2025
573dc86
fix tests
sir-sigurd Jul 2, 2025
301d92a
cleanup
sir-sigurd Jul 2, 2025
eca588c
cleanup
sir-sigurd Jul 3, 2025
0e1f231
cleanup
sir-sigurd Jul 3, 2025
bb9fef7
add some tests
sir-sigurd Jul 3, 2025
a06b308
isort
sir-sigurd Jul 3, 2025
9569aab
remove duplicated test
sir-sigurd Jul 3, 2025
8c9b804
remove readmes
sir-sigurd Jul 3, 2025
88cf525
more tests
sir-sigurd Jul 3, 2025
f86ccd0
fix handling no version id
sir-sigurd Jul 3, 2025
59fded7
update and more tests
sir-sigurd Jul 3, 2025
4d0b6fd
use mocker
sir-sigurd Jul 3, 2025
d2d0b17
supress linter
sir-sigurd Jul 3, 2025
9c16074
remove leftovers
sir-sigurd Jul 3, 2025
5acf8d9
remove more leftovers
sir-sigurd Jul 3, 2025
9d6f6f6
linters
sir-sigurd Jul 3, 2025
3c2f213
add missing stats
sir-sigurd Jul 3, 2025
8adb4b8
fix test
sir-sigurd Jul 3, 2025
947c798
update base images
sir-sigurd Jul 3, 2025
dd9ab72
refresh lock
sir-sigurd Jul 3, 2025
02782d4
remove dummy description
sir-sigurd Jul 4, 2025
7611ee2
fix normalize_object_version_id
sir-sigurd Jul 4, 2025
e916221
bump
sir-sigurd Jul 4, 2025
67d4575
fix pointer deletion
sir-sigurd Jul 7, 2025
300c876
fix comment
sir-sigurd Jul 7, 2025
6abb662
remove usage of old name
sir-sigurd Jul 7, 2025
e35f475
send manifest event from indexer to manifest indexer queue
sir-sigurd Jul 8, 2025
1466340
fix
sir-sigurd Jul 8, 2025
cdeb9de
add timeout
sir-sigurd Jul 9, 2025
69333fc
fix
sir-sigurd Jul 9, 2025
a547751
fix tests
sir-sigurd Jul 9, 2025
c35a5bc
rework batcher
sir-sigurd Jul 9, 2025
a6e790a
use update quilt-shared
sir-sigurd Jul 9, 2025
76a002f
use const
sir-sigurd Jul 9, 2025
0251b9a
add changelogs
sir-sigurd Jul 9, 2025
f900b26
add changelog stuf for py-shared
sir-sigurd Jul 9, 2025
2fa4910
indexer changelog
sir-sigurd Jul 9, 2025
839064c
Merge branch 'master' into index-pkg-entries-2
sir-sigurd Jul 14, 2025
103bc82
update hash
sir-sigurd Jul 14, 2025
7a0e367
update locks
sir-sigurd Jul 14, 2025
5443f91
revert a bit
sir-sigurd Jul 14, 2025
a3c8a5e
add comment
sir-sigurd Jul 14, 2025
9801de7
revert CI
sir-sigurd Jul 14, 2025
11a6268
update some comments
sir-sigurd Jul 15, 2025
3a2ab47
adjust test
sir-sigurd Jul 15, 2025
bd63b07
handle versioned objects
sir-sigurd Jul 15, 2025
e8731ac
Revert "revert CI"
sir-sigurd Jul 15, 2025
697fd2e
Reapply "revert CI"
sir-sigurd Jul 15, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ updates:
schedule:
interval: "weekly"

- package-ecosystem: "docker"
directory: "/lambdas/indexer"
schedule:
interval: "daily"

- package-ecosystem: "docker"
directory: "/lambdas/thumbnail"
schedule:
Expand Down
9 changes: 8 additions & 1 deletion .github/workflows/deploy-lambdas.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,13 @@ jobs:
matrix:
path:
- access_counts
- indexer
- es_ingest
- pkgevents
- pkgpush
- preview
- s3hash
- status_reports
- manifest_indexer
- tabular_preview
- transcode
runs-on: ubuntu-latest
Expand All @@ -29,6 +30,11 @@ jobs:
contents: read
steps:
- uses: actions/checkout@v4
- uses: astral-sh/setup-uv@v6
- name: Export requirements.txt from uv.lock
if: ${{ hashFiles(format('lambdas/{0}/uv.lock', matrix.path)) != '' }}
run: |
uv export --locked --no-emit-project --no-hashes --directory lambdas/${{ matrix.path }} -o requirements.txt
- name: Build zip
run: |
BUILDER_IMAGE=quiltdata/lambda:build-$(cat lambdas/${{ matrix.path }}/.python-version)
Expand Down Expand Up @@ -64,6 +70,7 @@ jobs:
strategy:
matrix:
path:
- indexer
- thumbnail
runs-on: ubuntu-latest
# These permissions are needed to interact with GitHub's OIDC Token endpoint.
Expand Down
10 changes: 9 additions & 1 deletion .github/workflows/py-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,9 @@ jobs:
matrix:
path:
- access_counts
- es_ingest
- indexer
- manifest_indexer
- pkgevents
- pkgpush
- preview
Expand All @@ -181,6 +183,12 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version-file: lambdas/${{ matrix.path }}/.python-version
- uses: astral-sh/setup-uv@v6
- name: Export requirements.txt from uv.lock
if: ${{ hashFiles(format('lambdas/{0}/uv.lock', matrix.path)) != '' }}
run: |
uv export --locked --no-emit-project --no-hashes --directory lambdas/${{ matrix.path }} -o requirements.txt
uv export --locked --no-emit-project --no-hashes --directory lambdas/${{ matrix.path }} --only-group test -o test-requirements.txt
- name: Install dependencies
run: |
if [ ${{ matrix.path }} == "thumbnail" ]
Expand All @@ -203,7 +211,7 @@ jobs:
python -m pip install -r lambdas/${{ matrix.path }}/test-requirements.txt
- name: Pytest
run: |
pytest --cov=lambdas lambdas/${{ matrix.path }}
python -m pytest --cov=lambdas lambdas/${{ matrix.path }}
- uses: codecov/codecov-action@v5
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
Expand Down
1 change: 1 addition & 0 deletions lambdas/es_ingest/.python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.11
20 changes: 20 additions & 0 deletions lambdas/es_ingest/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<!-- markdownlint-disable line-length -->
# Changelog

Changes are listed in reverse chronological order (newer entries at the top).
The entry format is

```markdown
- [Verb] Change description ([#<PR-number>](https://github.com/quiltdata/quilt/pull/<PR-number>))
```

where verb is one of

- Removed
- Added
- Fixed
- Changed

## Changes

- [Added] Bootstrap the change log ([#4422](https://github.com/quiltdata/quilt/pull/4422))
30 changes: 30 additions & 0 deletions lambdas/es_ingest/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
[project]
name = "t4_lambda_es_ingest"
version = "0.1.0"
authors = [
{ name = "Sergey Fedoseev", email = "[email protected]" }
]
requires-python = ">=3.11"
dependencies = [
"quilt-shared[boto,es]",
"t4-lambda-shared",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.uv.sources]
t4-lambda-shared = { url = "https://github.com/quiltdata/quilt/archive/d496dffbfb4b7a2ae05f6c1f7f0cb7d5d43bc984.zip", subdirectory = "lambdas/shared" }
quilt-shared = { url = "https://github.com/quiltdata/quilt/archive/df53c9ce125ea051e0d1ac41d58796336e202256.zip", subdirectory = "py-shared" }

[dependency-groups]
test = [
"pytest~=8.4",
"pytest-cov~=6.2",
"pytest-env~=1.1",
"pytest-mock~=3.14",
]

[tool.uv]
default-groups = ["test"]
6 changes: 6 additions & 0 deletions lambdas/es_ingest/pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[pytest]
env =
ES_ENDPOINT = http://localhost:9200
AWS_ACCESS_KEY_ID = test_key
AWS_SECRET_ACCESS_KEY = test_secret
AWS_DEFAULT_REGION = us-east-1
90 changes: 90 additions & 0 deletions lambdas/es_ingest/src/t4_lambda_es_ingest/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import json
import os
import random
import time

import boto3
import elasticsearch

from quilt_shared.es import make_elastic
from t4_lambda_shared.utils import get_quilt_logger

# Module-level clients are created once at import time and shared by handler().
s3_client = boto3.client("s3")
# ES_ENDPOINT must be set in the environment; a missing variable raises KeyError at import.
es = make_elastic(os.environ["ES_ENDPOINT"])
logger = get_quilt_logger()


# Threshold used by bulk(): a bulk request that takes longer than this is followed by a sleep.
EXPECTED_ES_RESPONSE_TIME = 10 # seconds


class BulkDocumentError(Exception):
    """Raised by bulk() when the Elasticsearch bulk response reports errors."""


class TooManyRequestsError(Exception):
    """Raised by bulk() when Elasticsearch rejects the request with HTTP 429."""


def sleep_until_timeout(context):
    """Sleep until about one second before the lambda timeout.

    Args:
        context: Lambda context object; only ``get_remaining_time_in_millis()``
            is used.
    """
    # Clamp at zero: with less than a second left the subtraction goes negative,
    # and time.sleep() raises ValueError for negative durations.
    remaining = max(0.0, context.get_remaining_time_in_millis() / 1000 - 1)
    logger.warning("Sleeping for %s seconds just before lambda timeout", remaining)
    # good night, sweet prince
    time.sleep(remaining)


def bulk(context, es, data: bytes):
    """Send one pre-built bulk request body to Elasticsearch.

    Args:
        context: Lambda context object; ``get_remaining_time_in_millis()`` bounds
            both the request timeout and any back-off sleep.
        es: Elasticsearch client whose ``bulk()`` method is called.
        data: Raw bulk request body.

    Raises:
        TooManyRequestsError: ES answered with HTTP 429; raised only after
            sleeping until just before the lambda timeout.
        BulkDocumentError: The bulk response has a truthy ``errors`` field.
        elasticsearch.exceptions.TransportError: Any other transport failure,
            re-raised unchanged.
    """
    # Monotonic clock: wall-clock adjustments must not distort the measured duration.
    started = time.monotonic()
    try:
        resp = es.bulk(
            data,
            # only the "errors" flag is needed from the response
            filter_path="errors",
            # wait as much as possible because it's better die trying than just die
            # leave a second to avoid lambda timeout
            request_timeout=context.get_remaining_time_in_millis() / 1000 - 1,
        )
    except elasticsearch.exceptions.TransportError as e:
        if e.status_code == 429:
            # at this point ES seems to be *very* overloaded, so we just sleep until lambda timeout
            logger.warning("Got a 429 Too Many Requests error, sleeping until lambda timeout")
            sleep_until_timeout(context)
            raise TooManyRequestsError from e
        raise

    delta = time.monotonic() - started
    logger.info("Bulk request took %s seconds", delta)
    overtime = delta - EXPECTED_ES_RESPONSE_TIME
    if overtime > 0:
        # if the request took so long ES seems to be overloaded, so it's better to sleep
        # now to avoid 429 Too Many Requests error later causing lambda failure and retry
        time_to_sleep = max(
            # never negative: time.sleep() raises ValueError when less than
            # a second of lambda time is left
            0.0,
            min(
                random.uniform(overtime / 2, overtime) + 15,
                context.get_remaining_time_in_millis() / 1000 - 1,
            ),
        )
        logger.warning("Sleeping for %s seconds to avoid ES overload", time_to_sleep)
        time.sleep(time_to_sleep)
    if resp["errors"]:
        # TODO: log errors from items.*.error?
        # TODO: ignore index_not_found_exception for delete operations?
        raise BulkDocumentError


def handler(event, context):
    """Lambda entry point: load a bulk request body from S3 and send it to ES.

    The incoming event must hold exactly one SQS message whose JSON body is an
    S3 event with exactly one record. The referenced object is read, passed to
    ``bulk()`` and deleted afterwards; if ``bulk()`` raises, the object is left
    in place.

    Args:
        event: SQS event wrapping a single S3 event notification.
        context: Lambda context object, forwarded to ``bulk()``.

    Raises:
        ValueError: The event does not contain exactly one SQS message wrapping
            exactly one S3 record.
    """
    logger.debug("Invoked with event: %s", event)

    # Explicit checks instead of `assert`, which is stripped when Python runs with -O.
    sqs_records = event["Records"]
    if len(sqs_records) != 1:
        raise ValueError(f"Expected exactly one SQS message, got {len(sqs_records)}")
    s3_records = json.loads(sqs_records[0]["body"])["Records"]
    if len(s3_records) != 1:
        raise ValueError(f"Expected exactly one S3 event record, got {len(s3_records)}")

    s3_info = s3_records[0]["s3"]
    # NOTE(review): S3 event notifications URL-encode object keys, while the key
    # is used verbatim here - confirm producers only write keys that need no decoding.
    params = {"Bucket": s3_info["bucket"]["name"], "Key": s3_info["object"]["key"]}
    version_id = s3_info["object"].get("versionId")
    if version_id:
        # address the exact version that triggered the event (read and delete)
        params["VersionId"] = version_id

    data = s3_client.get_object(**params)["Body"].read()
    bulk(context, es, data)
    # delete only after a successful bulk request
    s3_client.delete_object(**params)
91 changes: 91 additions & 0 deletions lambdas/es_ingest/tests/test_main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import json

import elasticsearch
import pytest
from botocore.stub import Stubber

import t4_lambda_es_ingest


def test_bulk_error(mocker):
    """bulk() raises BulkDocumentError when the ES response reports errors."""
    context = mocker.MagicMock()
    es_bulk = mocker.patch(
        "elasticsearch.Elasticsearch.bulk",
        return_value={"errors": True},
    )

    with pytest.raises(t4_lambda_es_ingest.BulkDocumentError):
        t4_lambda_es_ingest.bulk(context, t4_lambda_es_ingest.es, b"data")

    es_bulk.assert_called_once_with(
        b"data",
        filter_path=mocker.ANY,
        request_timeout=mocker.ANY,
    )


def test_bulk_too_many_requests(mocker):
    """bulk() sleeps until the lambda timeout and raises TooManyRequestsError on a 429."""
    context = mocker.MagicMock()
    # Make every TransportError look like an HTTP 429 response.
    mocker.patch("elasticsearch.exceptions.TransportError.status_code", 429)
    es_bulk = mocker.patch(
        "elasticsearch.Elasticsearch.bulk",
        side_effect=elasticsearch.exceptions.TransportError,
    )
    # Patched out so the test does not actually sleep.
    sleep_mock = mocker.patch("t4_lambda_es_ingest.sleep_until_timeout")

    with pytest.raises(t4_lambda_es_ingest.TooManyRequestsError):
        t4_lambda_es_ingest.bulk(context, t4_lambda_es_ingest.es, b"data")

    es_bulk.assert_called_once_with(
        b"data",
        filter_path=mocker.ANY,
        request_timeout=mocker.ANY,
    )
    sleep_mock.assert_called_once_with(context)


@pytest.mark.parametrize("version_id", ["test-version-id", None])
def test_handler(mocker, version_id):
    """handler() reads the S3 object, passes its body to bulk() and deletes the object."""
    context = mocker.MagicMock()

    # The same parameters are expected for both get_object and delete_object.
    expected_params = {"Bucket": "test-bucket", "Key": "test-key"}
    s3_object = {"key": "test-key"}
    if version_id:
        s3_object["versionId"] = version_id
        expected_params["VersionId"] = version_id

    s3_record = {"s3": {"bucket": {"name": "test-bucket"}, "object": s3_object}}
    sqs_event = {
        "Records": [
            {"body": json.dumps({"Records": [s3_record]})},
        ]
    }

    with Stubber(t4_lambda_es_ingest.s3_client) as stubber:
        bulk_mock = mocker.patch("t4_lambda_es_ingest.bulk")

        stubber.add_response(
            "get_object",
            {
                "Body": mocker.MagicMock(read=lambda: b"test data"),
                "LastModified": "2023-10-01T00:00:00Z",
            },
            expected_params,
        )
        stubber.add_response("delete_object", {}, expected_params)

        t4_lambda_es_ingest.handler(sqs_event, context)

    stubber.assert_no_pending_responses()
    bulk_mock.assert_called_once_with(context, t4_lambda_es_ingest.es, b"test data")
Loading