Files
hamprint/apps/submissions/migrations/0005_normalize_existing_data.py

93 lines
3.4 KiB
Python

"""One-shot data backfill for the schema change in 0004:
1. Populate `Submission.canonical_email` for every existing row by deriving
it from `submitted_by.email` (OAuth) or `guest_email` (guest) and
running through the same normaliser the live `save()` uses.
2. Re-normalise every `VerifiedEmail.email` already in the table. Rows that
collapse to the same canonical form are deduped: we keep the row with
the most recent `validated_at` and delete the others.
Defensive: the first pass only writes submissions whose `canonical_email`
actually changes, and the second pass only ever writes the normalised form of
an email, so re-running the migration leaves the data unchanged once it's
been applied.
"""
from django.db import migrations
def _normalize_email(email):
if not email or "@" not in email:
return (email or "").lower()
local, _, domain = email.lower().rpartition("@")
if "+" in local:
local = local.split("+", 1)[0]
return f"{local}@{domain}"
# Rows per bulk UPDATE / DELETE statement; keeps each query's parameter list
# comfortably small.
_BATCH_SIZE = 500


def _is_newer(candidate_at, current_at):
    """Return True when *candidate_at* is strictly later than *current_at*.

    A missing (``None``) timestamp is treated as older than any real one, so
    the comparison never raises ``TypeError``.
    """
    if candidate_at is None:
        return False
    if current_at is None:
        return True
    return candidate_at > current_at


def _backfill_canonical_emails(Submission, User):
    """Set ``canonical_email`` on every submission whose stored value is stale.

    The owner email is the submitting user's email when there is one,
    otherwise ``guest_email``, otherwise the empty string; it is then run
    through ``_normalize_email``.
    """
    # Pull all related users up front so we don't do a round-trip per
    # submission. Users with an empty email are left out on purpose so those
    # submissions fall back to `guest_email`.
    user_emails = dict(
        User.objects.exclude(email="").values_list("pk", "email")
    )

    stale = []
    for sub in Submission.objects.all().only(
        "pk", "submitted_by_id", "guest_email", "canonical_email"
    ):
        owner_email = ""
        if sub.submitted_by_id and user_emails.get(sub.submitted_by_id):
            owner_email = user_emails[sub.submitted_by_id]
        elif sub.guest_email:
            owner_email = sub.guest_email

        canonical = _normalize_email(owner_email)
        # Only touch rows that actually change, so a re-run writes nothing.
        if canonical != sub.canonical_email:
            sub.canonical_email = canonical
            stale.append(sub)

    if stale:
        Submission.objects.bulk_update(
            stale, ["canonical_email"], batch_size=_BATCH_SIZE
        )


def _dedupe_verified_emails(VerifiedEmail):
    """Collapse ``VerifiedEmail`` rows onto one row per normalised email.

    Within each group of rows sharing a normalised email, the row with the
    most recent ``validated_at`` is kept (the first row seen wins ties) and
    the rest are deleted. Rows whose email normalises to an empty string are
    deleted too. Survivors are then rewritten to the normalised form.
    """
    survivors = {}  # normalised email -> row being kept
    doomed_pks = []
    for row in VerifiedEmail.objects.all().only("pk", "email", "validated_at"):
        norm = _normalize_email(row.email)
        if not norm:
            doomed_pks.append(row.pk)
            continue
        kept = survivors.get(norm)
        if kept is None:
            survivors[norm] = row
        elif _is_newer(row.validated_at, kept.validated_at):
            doomed_pks.append(kept.pk)
            survivors[norm] = row
        else:
            doomed_pks.append(row.pk)

    # Delete the losers in batches rather than one query per row. This must
    # happen before the rewrite below so a survivor never takes on an email
    # that a soon-to-be-deleted duplicate still holds.
    for start in range(0, len(doomed_pks), _BATCH_SIZE):
        batch = doomed_pks[start:start + _BATCH_SIZE]
        VerifiedEmail.objects.filter(pk__in=batch).delete()

    # Rewrite only the survivors whose stored email is not already
    # normalised; on a re-run this loop issues no queries at all.
    for norm, row in survivors.items():
        if row.email != norm:
            VerifiedEmail.objects.filter(pk=row.pk).update(email=norm)


def forward(apps, schema_editor):
    """Backfill ``Submission.canonical_email`` and dedupe ``VerifiedEmail``.

    Args:
        apps: the historical app registry handed to ``RunPython`` callables;
            used to look up the ``Submission``, ``VerifiedEmail`` and
            ``auth.User`` models.
        schema_editor: unused; accepted because ``RunPython`` passes it.
    """
    Submission = apps.get_model("submissions", "Submission")
    VerifiedEmail = apps.get_model("submissions", "VerifiedEmail")
    User = apps.get_model("auth", "User")

    _backfill_canonical_emails(Submission, User)
    _dedupe_verified_emails(VerifiedEmail)
def reverse(apps, schema_editor):
    """Do nothing when the migration is rolled back.

    The forward pass is a backfill derived from existing data, so there is
    nothing meaningful to undo; rows are left exactly as they are.
    """
    return None
class Migration(migrations.Migration):
    # Must run after 0004, the schema change this backfill belongs to.
    dependencies = [("submissions", "0004_email_normalization")]

    # A single data step: `forward` on apply, `reverse` on rollback.
    operations = [migrations.RunPython(forward, reverse)]