Skip to content

Commit 81e0189

Browse files
authored
perf: move the core of the combine logic to be entirely in SQL (#2033)
This shaves roughly 40% of the runtime off of pyca/cryptography's combine.
1 parent 049a112 commit 81e0189

File tree

1 file changed

+128
-121
lines changed

1 file changed

+128
-121
lines changed

coverage/sqldata.py

Lines changed: 128 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,22 @@ def _wrapped(self: CoverageData, *args: Any, **kwargs: Any) -> Any:
122122
return _wrapped
123123

124124

125+
class NumbitsUnionAgg:
126+
"""SQLite aggregate function for computing union of numbits."""
127+
128+
def __init__(self) -> None:
129+
self.result = b""
130+
131+
def step(self, value: bytes) -> None:
132+
"""Process one value in the aggregation."""
133+
if value:
134+
self.result = numbits_union(self.result, value)
135+
136+
def finalize(self) -> bytes:
137+
"""Return the final aggregated result."""
138+
return self.result
139+
140+
125141
class CoverageData:
126142
"""Manages collected coverage data, including file storage.
127143
@@ -676,146 +692,137 @@ def update(
676692

677693
# Force the database we're writing to to exist before we start nesting contexts.
678694
self._start_using()
679-
680-
# Collector for all arcs, lines and tracers
681695
other_data.read()
682-
with other_data._connect() as con:
683-
# Get files data.
684-
with con.execute("select path from file") as cur:
685-
files = {path: map_path(path) for (path,) in cur}
686-
687-
# Get contexts data.
688-
with con.execute("select context from context") as cur:
689-
contexts = cur.fetchall()
690-
691-
# Get arc data.
692-
with con.execute(
693-
"select file.path, context.context, arc.fromno, arc.tono " +
694-
"from arc " +
695-
"inner join file on file.id = arc.file_id " +
696-
"inner join context on context.id = arc.context_id",
697-
) as cur:
698-
arcs = [
699-
(files[path], context, fromno, tono)
700-
for (path, context, fromno, tono) in cur
701-
]
702-
703-
# Get line data.
704-
with con.execute(
705-
"select file.path, context.context, line_bits.numbits " +
706-
"from line_bits " +
707-
"inner join file on file.id = line_bits.file_id " +
708-
"inner join context on context.id = line_bits.context_id",
709-
) as cur:
710-
lines: dict[tuple[str, str], bytes] = {}
711-
for path, context, numbits in cur:
712-
key = (files[path], context)
713-
if key in lines:
714-
numbits = numbits_union(lines[key], numbits)
715-
lines[key] = numbits
716-
717-
# Get tracer data.
718-
with con.execute(
719-
"select file.path, tracer " +
720-
"from tracer " +
721-
"inner join file on file.id = tracer.file_id",
722-
) as cur:
723-
tracers = {files[path]: tracer for (path, tracer) in cur}
696+
697+
# Ensure other_data has a properly initialized database
698+
with other_data._connect():
699+
pass
724700

725701
with self._connect() as con:
726702
assert con.con is not None
727703
con.con.isolation_level = "IMMEDIATE"
728704

729-
# Get all tracers in the DB. Files not in the tracers are assumed
730-
# to have an empty string tracer. Since Sqlite does not support
731-
# full outer joins, we have to make two queries to fill the
732-
# dictionary.
733-
with con.execute("select path from file") as cur:
734-
this_tracers = {path: "" for path, in cur}
735-
with con.execute(
736-
"select file.path, tracer from tracer " +
737-
"inner join file on file.id = tracer.file_id",
738-
) as cur:
739-
this_tracers.update({
740-
map_path(path): tracer
741-
for path, tracer in cur
742-
})
743-
744-
# Create all file and context rows in the DB.
745-
con.executemany_void(
746-
"insert or ignore into file (path) values (?)",
747-
[(file,) for file in files.values()],
748-
)
749-
with con.execute("select id, path from file") as cur:
750-
file_ids = {path: id for id, path in cur}
751-
self._file_map.update(file_ids)
752-
con.executemany_void(
753-
"insert or ignore into context (context) values (?)",
754-
contexts,
705+
# Register functions for SQLite
706+
con.con.create_function("numbits_union", 2, numbits_union)
707+
con.con.create_function("map_path", 1, map_path)
708+
con.con.create_aggregate(
709+
"numbits_union_agg", 1, NumbitsUnionAgg # type: ignore[arg-type]
755710
)
756-
with con.execute("select id, context from context") as cur:
757-
context_ids = {context: id for id, context in cur}
758-
759-
# Prepare tracers and fail, if a conflict is found.
760-
# tracer_paths is used to ensure consistency over the tracer data
761-
# and tracer_map tracks the tracers to be inserted.
762-
tracer_map = {}
763-
for path in files.values():
764-
this_tracer = this_tracers.get(path)
765-
other_tracer = tracers.get(path, "")
766-
# If there is no tracer, there is always the None tracer.
767-
if this_tracer is not None and this_tracer != other_tracer:
711+
712+
# Attach the other database
713+
con.execute_void("ATTACH DATABASE ? AS other_db", (other_data.data_filename(),))
714+
715+
# Create temporary table with mapped file paths to avoid repeated map_path() calls
716+
con.execute_void("""
717+
CREATE TEMP TABLE other_file_mapped AS
718+
SELECT
719+
other_file.id as other_file_id,
720+
map_path(other_file.path) as mapped_path
721+
FROM other_db.file AS other_file
722+
""")
723+
724+
# Check for tracer conflicts before proceeding
725+
with con.execute("""
726+
SELECT other_file_mapped.mapped_path,
727+
COALESCE(main.tracer.tracer, ''),
728+
COALESCE(other_db.tracer.tracer, '')
729+
FROM main.file
730+
LEFT JOIN main.tracer ON main.file.id = main.tracer.file_id
731+
INNER JOIN other_file_mapped ON main.file.path = other_file_mapped.mapped_path
732+
LEFT JOIN other_db.tracer ON other_file_mapped.other_file_id = other_db.tracer.file_id
733+
WHERE COALESCE(main.tracer.tracer, '') != COALESCE(other_db.tracer.tracer, '')
734+
""") as cur:
735+
conflicts = list(cur)
736+
if conflicts:
737+
path, this_tracer, other_tracer = conflicts[0]
768738
raise DataError(
769739
"Conflicting file tracer name for '{}': {!r} vs {!r}".format(
770740
path, this_tracer, other_tracer,
771741
),
772742
)
773-
tracer_map[path] = other_tracer
774743

775-
# Prepare arc and line rows to be inserted by converting the file
776-
# and context strings with integer ids. Then use the efficient
777-
# `executemany()` to insert all rows at once.
744+
# Insert missing files from other_db (with map_path applied)
745+
con.execute_void("""
746+
INSERT OR IGNORE INTO main.file (path)
747+
SELECT DISTINCT mapped_path FROM other_file_mapped
748+
""")
778749

779-
if arcs:
780-
self._choose_lines_or_arcs(arcs=True)
750+
# Insert missing contexts from other_db
751+
con.execute_void("""
752+
INSERT OR IGNORE INTO main.context (context)
753+
SELECT context FROM other_db.context
754+
""")
781755

782-
arc_rows = [
783-
(file_ids[file], context_ids[context], fromno, tono)
784-
for file, context, fromno, tono in arcs
785-
]
756+
# Update file_map with any new files
757+
with con.execute("select id, path from file") as cur:
758+
self._file_map.update({path: id for id, path in cur})
786759

787-
# Write the combined data.
788-
con.executemany_void(
789-
"insert or ignore into arc " +
790-
"(file_id, context_id, fromno, tono) values (?, ?, ?, ?)",
791-
arc_rows,
792-
)
760+
with con.execute("""
761+
SELECT
762+
EXISTS(SELECT 1 FROM other_db.arc),
763+
EXISTS(SELECT 1 FROM other_db.line_bits)
764+
""") as cur:
765+
has_arcs, has_lines = cur.fetchone()
793766

794-
if lines:
767+
# Handle arcs if present in other_db
768+
if has_arcs:
769+
self._choose_lines_or_arcs(arcs=True)
770+
con.execute_void("""
771+
INSERT OR IGNORE INTO main.arc (file_id, context_id, fromno, tono)
772+
SELECT
773+
main_file.id,
774+
main_context.id,
775+
other_arc.fromno,
776+
other_arc.tono
777+
FROM other_db.arc AS other_arc
778+
INNER JOIN other_file_mapped ON other_arc.file_id = other_file_mapped.other_file_id
779+
INNER JOIN other_db.context AS other_context ON other_arc.context_id = other_context.id
780+
INNER JOIN main.file AS main_file ON other_file_mapped.mapped_path = main_file.path
781+
INNER JOIN main.context AS main_context ON other_context.context = main_context.context
782+
""")
783+
784+
# Handle line_bits if present in other_db
785+
if has_lines:
795786
self._choose_lines_or_arcs(lines=True)
796787

797-
for (file, context), numbits in lines.items():
798-
with con.execute(
799-
"select numbits from line_bits where file_id = ? and context_id = ?",
800-
(file_ids[file], context_ids[context]),
801-
) as cur:
802-
existing = list(cur)
803-
if existing:
804-
lines[(file, context)] = numbits_union(numbits, existing[0][0])
805-
806-
con.executemany_void(
807-
"insert or replace into line_bits " +
808-
"(file_id, context_id, numbits) values (?, ?, ?)",
809-
[
810-
(file_ids[file], context_ids[context], numbits)
811-
for (file, context), numbits in lines.items()
812-
],
813-
)
814-
815-
con.executemany_void(
816-
"insert or ignore into tracer (file_id, tracer) values (?, ?)",
817-
[(file_ids[filename], tracer) for filename, tracer in tracer_map.items()],
818-
)
788+
# Handle line_bits by aggregating other_db data by mapped target,
789+
# then inserting/updating
790+
con.execute_void("""
791+
INSERT OR REPLACE INTO main.line_bits (file_id, context_id, numbits)
792+
SELECT
793+
main_file.id,
794+
main_context.id,
795+
numbits_union(
796+
COALESCE((
797+
SELECT numbits FROM main.line_bits
798+
WHERE file_id = main_file.id AND context_id = main_context.id
799+
), X''),
800+
aggregated.combined_numbits
801+
)
802+
FROM (
803+
SELECT
804+
other_file_mapped.mapped_path,
805+
other_context.context,
806+
numbits_union_agg(other_line_bits.numbits) as combined_numbits
807+
FROM other_db.line_bits AS other_line_bits
808+
INNER JOIN other_file_mapped ON other_line_bits.file_id = other_file_mapped.other_file_id
809+
INNER JOIN other_db.context AS other_context ON other_line_bits.context_id = other_context.id
810+
GROUP BY other_file_mapped.mapped_path, other_context.context
811+
) AS aggregated
812+
INNER JOIN main.file AS main_file ON aggregated.mapped_path = main_file.path
813+
INNER JOIN main.context AS main_context ON aggregated.context = main_context.context
814+
""")
815+
816+
# Insert tracers from other_db (avoiding conflicts we already checked)
817+
con.execute_void("""
818+
INSERT OR IGNORE INTO main.tracer (file_id, tracer)
819+
SELECT
820+
main_file.id,
821+
other_tracer.tracer
822+
FROM other_db.tracer AS other_tracer
823+
INNER JOIN other_file_mapped ON other_tracer.file_id = other_file_mapped.other_file_id
824+
INNER JOIN main.file AS main_file ON other_file_mapped.mapped_path = main_file.path
825+
""")
819826

820827
if not self._no_disk:
821828
# Update all internal cache data.

0 commit comments

Comments
 (0)