12 changes: 11 additions & 1 deletion .gitignore
@@ -45,4 +45,14 @@ next-env.d.ts
*/tiles/*

y/
cleanandgreenphilly_gcloud_bucket/

# personal service account key for GCP
data/src/app/service-account-key.json

# compiled python files
*.pyc

# awkde build files
data/src/awkde/build/
tmp/
11 changes: 10 additions & 1 deletion .vscode/settings.json
@@ -1,4 +1,13 @@
{
  "editor.formatOnSave": true,
  "editor.defaultFormatter": "esbenp.prettier-vscode"
  "editor.defaultFormatter": "esbenp.prettier-vscode",
  "python.analysis.extraPaths": ["${workspaceFolder}/data/src", "${workspaceFolder}/data/src/awkde"],
  "python.testing.pytestEnabled": true,
  "python.testing.cwd": "${workspaceFolder}/data/src",
  "[python]": {
    "editor.formatOnSave": true,
    "editor.codeActionsOnSave": {
      "source.organizeImports": "explicit"
    },
  }
}
4 changes: 3 additions & 1 deletion data/Dockerfile
@@ -4,14 +4,16 @@ FROM python:3.11.4
# Set the working directory in the container
WORKDIR /usr/src/app

# Install system dependencies for GDAL and Tippecanoe
# postgresql-client for psql and pg_dump executables for backups
RUN apt-get update && apt-get install -y \
    libgdal-dev \
    gcc \
    git \
    build-essential \
    libsqlite3-dev \
    zlib1g-dev \
    postgresql-client \
    && rm -rf /var/lib/apt/lists/*

# Set GDAL environment variables
3 changes: 3 additions & 0 deletions data/docker-compose.yml
@@ -9,9 +9,12 @@ services:
      - CLEAN_GREEN_GOOGLE_KEY
      - PYTHONUNBUFFERED=1
      - GOOGLE_CLOUD_BUCKET_NAME
      - CAGP_SLACK_API_TOKEN
    volumes:
      - ./src:/usr/src/app
      - ~/.config/gcloud/application_default_credentials.json:/app/service-account-key.json
    extra_hosts:
      - host.docker.internal:host-gateway

  formatter:
    build: .
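The new CAGP_SLACK_API_TOKEN variable is passed through so the ETL container can post its diff report to Slack with the slack-sdk package added in the Pipfile below. A minimal sketch of that pattern; the channel and message text are placeholders, and the project's actual reporting code is not shown in this diff:

import os

from slack_sdk import WebClient

# Assumes CAGP_SLACK_API_TOKEN is set in the container, as wired up in docker-compose.yml above.
client = WebClient(token=os.environ["CAGP_SLACK_API_TOKEN"])
client.chat_postMessage(
    channel="#data-pipeline",  # placeholder channel name
    text="ETL refresh finished; data diff report to follow.",
)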
5 changes: 5 additions & 0 deletions data/src/Pipfile
@@ -21,6 +21,11 @@ psycopg2-binary = "*"
geoalchemy2 = "*"
mapbox = "*"
google-cloud-storage = "*"
pydantic = "==1.10.12"
data-diff = {extras = ["postgresql"], version = "*"}
future = "*"
slack-sdk = "*"
pytest = "*"

[dev-packages]

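data-diff (with the postgresql extra) is what lets the pipeline report row-level differences between the refreshed public schema and the backup_ snapshot. A rough sketch of its Python API for a single table; the connection URL, table name, and key column are placeholders, and qualifying the table name with a schema prefix this way is an assumption rather than something shown in this PR:

from data_diff import connect_to_table, diff_tables

# Placeholder connection string; real values would come from the project's config.
db_url = "postgresql://user:password@localhost:5432/vacantlotdb"

old = connect_to_table(db_url, "backup_.vacant_properties", "id")
new = connect_to_table(db_url, "public.vacant_properties", "id")

# Each yielded item is a ('+' or '-', row) tuple marking which side the row is exclusive to.
for sign, row in diff_tables(old, new):
    print(sign, row)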
1,416 changes: 960 additions & 456 deletions data/src/Pipfile.lock

Large diffs are not rendered by default.

99 changes: 99 additions & 0 deletions data/src/classes/backup_archive_database.py
@@ -0,0 +1,99 @@
import logging as log
import subprocess
from datetime import datetime, timedelta

import sqlalchemy as sa
from config.config import log_level, max_backup_schema_days
from config.psql import conn, local_engine, url
from data_utils.utils import mask_password
from sqlalchemy import inspect

log.basicConfig(level=log_level)

backup_schema_name: str = "backup_"
""" the prefix for the backup schemas """

date_time_format: str = "%Y_%m_%dt%H_%M_%S"
""" the datetime format for the backup schema names """


class BackupArchiveDatabase:
    """
    Manages a backup of the public schema before the ETL refresh is run. After the
    ETL job has finished and data differences have been reported, this class moves
    the current backup schema to a timestamped backup and prunes older backup schemas.
    """

    def __init__(self):
        self.timestamp_string = datetime.now().strftime(date_time_format)
        self.backup_schema_archive_name = backup_schema_name + self.timestamp_string

    def backup_schema(self):
        """
        Back up the whole public schema to another schema in the same database:
        pg_dump the public schema, replace the public schema name with the backup
        schema name, clean up the special column types, and import it with psql
        in one piped command.
        """

        pgdump_command = (
            # first, dump the schema only, where we can safely replace all 'public' strings with 'backup_'
            "pg_dump "
            + url
            + " -s --schema public | sed 's/public/"
            + backup_schema_name
            + "/g' | sed 's/"
            + backup_schema_name
            + ".geometry/public.geometry/' | sed 's/"
            + backup_schema_name
            + ".spatial_ref_sys/public.spatial_ref_sys/' | psql -v ON_ERROR_STOP=1 "
            + url
            + " > /dev/null "
            # then dump the data only and substitute the word 'public' only where it appears in DDL, not in the data
            + " && pg_dump "
            + url
            + " -a --schema public | sed 's/COPY public./COPY "
            + backup_schema_name
            + "./g' | sed 's/"
            + backup_schema_name
            + ".geometry/public.geometry/' | sed 's/"
            + backup_schema_name
            + ".spatial_ref_sys/public.spatial_ref_sys/' | psql -v ON_ERROR_STOP=1 "
            + url
            + " > /dev/null "
        )
        log.debug(mask_password(pgdump_command))
        complete_process = subprocess.run(pgdump_command, check=False, shell=True)

        if complete_process.returncode != 0 or complete_process.stderr:
            raise RuntimeError(
                "pg_dump command "
                + mask_password(pgdump_command)
                + " did not exit with success. "
                # stderr is None unless output is captured, so guard before decoding
                + (complete_process.stderr.decode() if complete_process.stderr else "")
            )

    def archive_backup_schema(self):
        """
        Rename the backup_ schema to "backup_" + backup_timestamp.
        """
        sql = (
            "ALTER SCHEMA "
            + backup_schema_name
            + " RENAME TO "
            + self.backup_schema_archive_name
        )
        log.debug(sql)
        conn.execute(sa.DDL(sql))

    def prune_old_archives(self):
        """
        Drop backup schemas that are older than max_backup_schema_days.
        """
        # list all backup schemas and drop any whose timestamp is past the cutoff
        schemas = inspect(local_engine).get_schema_names()
        cutoff = datetime.now() - timedelta(days=max_backup_schema_days)
        for schema in schemas:
            if schema.startswith(backup_schema_name):
                timestamp = schema.replace(backup_schema_name, "")
                backed_up_time = datetime.strptime(timestamp, date_time_format)
                if backed_up_time < cutoff:
                    sql = "drop schema " + schema + " cascade"
                    log.debug(sql)
                    conn.execute(sa.DDL(sql))
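For orientation, a hypothetical sketch of how this class is meant to be driven around an ETL run; the real call sites live in the pipeline entry point, which is not part of this file:

# Import path assumes data/src is on the Python path, per the VS Code settings above.
from classes.backup_archive_database import BackupArchiveDatabase

backup = BackupArchiveDatabase()
backup.backup_schema()           # snapshot public -> backup_ before the refresh
# ... run the ETL refresh and report data differences against backup_ ...
backup.archive_backup_schema()   # rename backup_ -> backup_<timestamp>
backup.prune_old_archives()      # drop archives older than max_backup_schema_days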