5 | 5 |
6 | 6 | package marquez.db;
7 | 7 |
| 8 | +import java.time.Instant;
8 | 9 | import java.util.Collection;
9 | 10 | import java.util.List;
10 | 11 | import java.util.Optional;
11 | 12 | import java.util.Set;
12 | 13 | import java.util.UUID;
| 14 | +import javax.validation.constraints.NotNull;
| 15 | +import marquez.common.models.DatasetName;
| 16 | +import marquez.common.models.JobName;
| 17 | +import marquez.common.models.NamespaceName;
| 18 | +import marquez.common.models.RunId;
13 | 19 | import marquez.db.mappers.DatasetDataMapper;
14 | 20 | import marquez.db.mappers.JobDataMapper;
15 | 21 | import marquez.db.mappers.JobRowMapper;
16 | 22 | import marquez.db.mappers.RunMapper;
| 23 | +import marquez.db.mappers.UpstreamRunRowMapper;
17 | 24 | import marquez.service.models.DatasetData;
18 | 25 | import marquez.service.models.JobData;
19 | 26 | import marquez.service.models.Run;
25 | 32 | @RegisterRowMapper(JobDataMapper.class)
26 | 33 | @RegisterRowMapper(RunMapper.class)
27 | 34 | @RegisterRowMapper(JobRowMapper.class)
| 35 | +@RegisterRowMapper(UpstreamRunRowMapper.class)
28 | 36 | public interface LineageDao {
29 | 37 |
| 38 | + public record JobSummary(NamespaceName namespace, JobName name, UUID version) {}
| 39 | +
| 40 | + public record RunSummary(RunId id, Instant start, Instant end, String status) {}
| 41 | +
| 42 | + public record DatasetSummary(
| 43 | + NamespaceName namespace, DatasetName name, UUID version, RunId producedByRunId) {}
| 44 | +
| 45 | + public record UpstreamRunRow(JobSummary job, RunSummary run, DatasetSummary input) {}
| 46 | +
30 | 47 | /**
31 | 48 | * Fetch all of the jobs that consume or produce the datasets that are consumed or produced by the
32 | 49 | * input jobIds. This returns a single layer from the BFS using datasets as edges. Jobs that have
@@ -154,4 +171,51 @@ SELECT DISTINCT on(r.job_name, r.namespace_name) r.*, jv.version as job_version
154 | 171 | WHERE j.uuid in (<jobUuid>) OR j.symlink_target_uuid IN (<jobUuid>)
155 | 172 | ORDER BY r.job_name, r.namespace_name, created_at DESC""")
156 | 173 | List<Run> getCurrentRuns(@BindList Collection<UUID> jobUuid);
| 174 | +
| 175 | + @SqlQuery(
| 176 | + """
| 177 | + WITH RECURSIVE
| 178 | + upstream_runs(
| 179 | + r_uuid, -- run uuid
| 180 | + dataset_uuid, dataset_version_uuid, dataset_namespace, dataset_name, -- input dataset version to the run
| 181 | + u_r_uuid, -- upstream run that produced that dataset version
| 182 | + depth -- current depth of traversal
| 183 | + ) AS (
| 184 | +
| 185 | + -- initial case: find the inputs of the initial runs
| 186 | + select r.uuid,
| 187 | + dv.dataset_uuid, dv."version", dv.namespace_name, dv.dataset_name,
| 188 | + dv.run_uuid,
| 189 | + 0 AS depth -- starts at 0
| 190 | + FROM (SELECT :runId::uuid AS uuid) r -- initial run
| 191 | + LEFT JOIN runs_input_mapping rim ON rim.run_uuid = r.uuid
| 192 | + LEFT JOIN dataset_versions dv ON dv.uuid = rim.dataset_version_uuid
| 193 | +
| 194 | + UNION
| 195 | +
| 196 | + -- recursion: find the inputs of the inputs found on the previous iteration and increase depth to know when to stop
| 197 | + SELECT
| 198 | + ur.u_r_uuid,
| 199 | + dv2.dataset_uuid, dv2."version", dv2.namespace_name, dv2.dataset_name,
| 200 | + dv2.run_uuid,
| 201 | + ur.depth + 1 AS depth -- increase depth to check end condition
| 202 | + FROM upstream_runs ur
| 203 | + LEFT JOIN runs_input_mapping rim2 ON rim2.run_uuid = ur.u_r_uuid
| 204 | + LEFT JOIN dataset_versions dv2 ON dv2.uuid = rim2.dataset_version_uuid
| 205 | + -- end condition of the recursion: no input or depth is over the maximum set
| 206 | + -- also avoid following cycles (ex: merge statement)
| 207 | + WHERE ur.u_r_uuid IS NOT NULL AND ur.u_r_uuid <> ur.r_uuid AND depth < :depth
| 208 | + )
| 209 | +
| 210 | + -- present the result: use Distinct as we may have traversed the same edge multiple times if there are diamonds in the graph.
| 211 | + SELECT * FROM ( -- we need the extra statement to sort after the DISTINCT
| 212 | + SELECT DISTINCT ON (upstream_runs.r_uuid, upstream_runs.dataset_version_uuid, upstream_runs.u_r_uuid)
| 213 | + upstream_runs.*,
| 214 | + r.started_at, r.ended_at, r.current_run_state as state,
| 215 | + r.job_uuid, r.job_version_uuid, r.namespace_name as job_namespace, r.job_name
| 216 | + FROM upstream_runs, runs r WHERE upstream_runs.r_uuid = r.uuid
| 217 | + ) sub
| 218 | + ORDER BY depth ASC, job_name ASC;
| 219 | + """)
| 220 | + List<UpstreamRunRow> getUpstreamRuns(@NotNull UUID runId, int depth);
157 | 221 | }
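
For reference, here is a minimal caller sketch for the new getUpstreamRuns query. It is hypothetical and not part of this change: it groups the flattened UpstreamRunRow results by run, pairing each run in the upstream graph with its input dataset versions. The UpstreamLineageSketch class, the upstreamByRun method, the package name, and the null-input handling are illustrative assumptions.

package marquez.example; // hypothetical package, for illustration only

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.stream.Collectors;
import marquez.db.LineageDao;
import marquez.db.LineageDao.DatasetSummary;
import marquez.db.LineageDao.RunSummary;
import marquez.db.LineageDao.UpstreamRunRow;

class UpstreamLineageSketch {

  /**
   * Groups the flattened rows by run: one entry per run in the upstream graph,
   * keyed by its RunSummary, with the input dataset versions that run read.
   * The query already orders rows by depth, so a LinkedHashMap keeps the
   * nearest upstream runs first.
   */
  static Map<RunSummary, List<DatasetSummary>> upstreamByRun(
      LineageDao dao, UUID runId, int maxDepth) {
    List<UpstreamRunRow> rows = dao.getUpstreamRuns(runId, maxDepth);
    return rows.stream()
        // the starting run may have no inputs; skip rows without a dataset (assumption)
        .filter(row -> row.input() != null)
        .collect(
            Collectors.groupingBy(
                UpstreamRunRow::run,
                LinkedHashMap::new,
                Collectors.mapping(UpstreamRunRow::input, Collectors.toList())));
  }
}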