12 changes: 11 additions & 1 deletion .gitignore
@@ -45,4 +45,14 @@ next-env.d.ts
*/tiles/*

y/
cleanandgreenphilly_gcloud_bucket/

# personal service account key for GCP
data/src/app/service-account-key.json

# compiled python files
*.pyc

# awkde build files
data/src/awkde/build/
tmp/
11 changes: 10 additions & 1 deletion .vscode/settings.json
@@ -1,4 +1,13 @@
{
  "editor.formatOnSave": true,
  "editor.defaultFormatter": "esbenp.prettier-vscode"
  "editor.defaultFormatter": "esbenp.prettier-vscode",
  "python.analysis.extraPaths": ["${workspaceFolder}/data/src", "${workspaceFolder}/data/src/awkde"],
  "python.testing.pytestEnabled": true,
  "python.testing.cwd": "${workspaceFolder}/data/src",
  "[python]": {
    "editor.formatOnSave": true,
    "editor.codeActionsOnSave": {
      "source.organizeImports": "explicit"
    },
  }
}
4 changes: 3 additions & 1 deletion data/Dockerfile
@@ -4,14 +4,16 @@ FROM python:3.11.4
# Set the working directory in the container
WORKDIR /usr/src/app

# Install system dependencies for GDAL and Tippecanoe
# postgresql-client for psql and pg_dump executables for backups
RUN apt-get update && apt-get install -y \
    libgdal-dev \
    gcc \
    git \
    build-essential \
    libsqlite3-dev \
    zlib1g-dev \
    postgresql-client \
    && rm -rf /var/lib/apt/lists/*

# Set GDAL environment variables
3 changes: 3 additions & 0 deletions data/docker-compose.yml
@@ -9,9 +9,12 @@ services:
      - CLEAN_GREEN_GOOGLE_KEY
      - PYTHONUNBUFFERED=1
      - GOOGLE_CLOUD_BUCKET_NAME
      - CAGP_SLACK_API_TOKEN
    volumes:
      - ./src:/usr/src/app
      - ~/.config/gcloud/application_default_credentials.json:/app/service-account-key.json
    extra_hosts:
      - host.docker.internal:host-gateway

  formatter:
    build: .
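The new CAGP_SLACK_API_TOKEN variable is passed through so the ETL container can post its diff report to Slack with the slack-sdk package added in the Pipfile below. A minimal sketch of that pattern; the channel and message text are placeholders, and the project's actual reporting code is not shown in this diff:

import os

from slack_sdk import WebClient

# Assumes CAGP_SLACK_API_TOKEN is set in the container, as wired up in docker-compose.yml above.
client = WebClient(token=os.environ["CAGP_SLACK_API_TOKEN"])
client.chat_postMessage(
    channel="#data-pipeline",  # placeholder channel name
    text="ETL refresh finished; data diff report to follow.",
)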
5 changes: 5 additions & 0 deletions data/src/Pipfile
@@ -21,6 +21,11 @@ psycopg2-binary = "*"
geoalchemy2 = "*"
mapbox = "*"
google-cloud-storage = "*"
pydantic = "==1.10.12"
data-diff = {extras = ["postgresql"], version = "*"}
future = "*"
slack-sdk = "*"
pytest = "*"

[dev-packages]

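data-diff (with the postgresql extra) is what lets the pipeline report row-level differences between the refreshed public schema and the backup_ snapshot. A rough sketch of its Python API for a single table; the connection URL, table name, and key column are placeholders, and qualifying the table name with a schema prefix this way is an assumption rather than something shown in this PR:

from data_diff import connect_to_table, diff_tables

# Placeholder connection string; real values would come from the project's config.
db_url = "postgresql://user:password@localhost:5432/vacantlotdb"

old = connect_to_table(db_url, "backup_.vacant_properties", "id")
new = connect_to_table(db_url, "public.vacant_properties", "id")

# Each yielded item is a ('+' or '-', row) tuple marking which side the row is exclusive to.
for sign, row in diff_tables(old, new):
    print(sign, row)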
1,416 changes: 960 additions & 456 deletions data/src/Pipfile.lock

Large diffs are not rendered by default.

99 changes: 99 additions & 0 deletions data/src/classes/backup_archive_database.py
@@ -0,0 +1,99 @@
import logging as log
import subprocess
from datetime import datetime, timedelta

import sqlalchemy as sa
from config.config import log_level, max_backup_schema_days
from config.psql import conn, local_engine, url
from data_utils.utils import mask_password
from sqlalchemy import inspect

log.basicConfig(level=log_level)

backup_schema_name: str = "backup_"
""" the prefix for the backup schemas """

date_time_format: str = "%Y_%m_%dt%H_%M_%S"
""" the datetime format for the backup schema names """


class BackupArchiveDatabase:
    """
    Manages a backup of the public schema before the ETL refresh is run. After the
    ETL job has finished and data differences have been reported, this class moves
    the current backup schema to a timestamped backup and prunes older backup schemas.
    """

    def __init__(self):
        self.timestamp_string = datetime.now().strftime(date_time_format)
        self.backup_schema_archive_name = backup_schema_name + self.timestamp_string

    def backup_schema(self):
        """
        Back up the whole public schema to another schema in the same database:
        pg_dump the public schema, replace the public schema name with the backup
        schema name, clean up the special column types, and import it with psql
        in one piped command.
        """

        pgdump_command = (
            # first, dump the schema only, where we can safely replace all 'public' strings with 'backup_'
            "pg_dump "
            + url
            + " -s --schema public | sed 's/public/"
            + backup_schema_name
            + "/g' | sed 's/"
            + backup_schema_name
            + ".geometry/public.geometry/' | sed 's/"
            + backup_schema_name
            + ".spatial_ref_sys/public.spatial_ref_sys/' | psql -v ON_ERROR_STOP=1 "
            + url
            + " > /dev/null "
            # then dump the data only and substitute the word 'public' only where it appears in DDL, not in the data
            + " && pg_dump "
            + url
            + " -a --schema public | sed 's/COPY public./COPY "
            + backup_schema_name
            + "./g' | sed 's/"
            + backup_schema_name
            + ".geometry/public.geometry/' | sed 's/"
            + backup_schema_name
            + ".spatial_ref_sys/public.spatial_ref_sys/' | psql -v ON_ERROR_STOP=1 "
            + url
            + " > /dev/null "
        )
        log.debug(mask_password(pgdump_command))
        complete_process = subprocess.run(pgdump_command, check=False, shell=True)

        if complete_process.returncode != 0 or complete_process.stderr:
            raise RuntimeError(
                "pg_dump command "
                + mask_password(pgdump_command)
                + " did not exit with success. "
                # stderr is None unless output is captured, so guard before decoding
                + (complete_process.stderr.decode() if complete_process.stderr else "")
            )

    def archive_backup_schema(self):
        """
        Rename the backup_ schema to "backup_" + backup_timestamp.
        """
        sql = (
            "ALTER SCHEMA "
            + backup_schema_name
            + " RENAME TO "
            + self.backup_schema_archive_name
        )
        log.debug(sql)
        conn.execute(sa.DDL(sql))

    def prune_old_archives(self):
        """
        Drop backup schemas that are older than max_backup_schema_days.
        """
        # list all backup schemas and drop any whose timestamp is past the cutoff
        schemas = inspect(local_engine).get_schema_names()
        cutoff = datetime.now() - timedelta(days=max_backup_schema_days)
        for schema in schemas:
            if schema.startswith(backup_schema_name):
                timestamp = schema.replace(backup_schema_name, "")
                backed_up_time = datetime.strptime(timestamp, date_time_format)
                if backed_up_time < cutoff:
                    sql = "drop schema " + schema + " cascade"
                    log.debug(sql)
                    conn.execute(sa.DDL(sql))
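For orientation, a hypothetical sketch of how this class is meant to be driven around an ETL run; the real call sites live in the pipeline entry point, which is not part of this file:

# Import path assumes data/src is on the Python path, per the VS Code settings above.
from classes.backup_archive_database import BackupArchiveDatabase

backup = BackupArchiveDatabase()
backup.backup_schema()           # snapshot public -> backup_ before the refresh
# ... run the ETL refresh and report data differences against backup_ ...
backup.archive_backup_schema()   # rename backup_ -> backup_<timestamp>
backup.prune_old_archives()      # drop archives older than max_backup_schema_days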