93 lines
3.4 KiB
Python
93 lines
3.4 KiB
Python
"""One-shot data backfill for the schema change in 0004:
|
|
|
|
1. Populate `Submission.canonical_email` for every existing row by deriving
   it from `submitted_by.email` (OAuth) or `guest_email` (guest) and
   running it through the same normaliser the live `save()` uses.
2. Re-normalise every `VerifiedEmail.email` already in the table. Rows that
   collapse to the same canonical form are deduped: we keep the row with
   the most recent `validated_at` and delete the others. Rows whose email
   normalises to the empty string are deleted as well.

Defensive: both passes derive their writes from the rows currently in the
table (compare-then-`bulk_update` for submissions, dedupe-then-`update` for
verified emails), so re-running the migration is a no-op once it's been
applied.
|
|
"""
|
|
|
|
from django.db import migrations
|
|
|
|
|
|
def _normalize_email(email):
|
|
if not email or "@" not in email:
|
|
return (email or "").lower()
|
|
local, _, domain = email.lower().rpartition("@")
|
|
if "+" in local:
|
|
local = local.split("+", 1)[0]
|
|
return f"{local}@{domain}"
|
|
|
|
|
|
def _is_newer(candidate, incumbent):
    """Return True when *candidate* is strictly more recent than *incumbent*.

    A missing timestamp (None) is treated as older than any real one so the
    comparison never raises.
    NOTE(review): whether `validated_at` is nullable is not visible from this
    migration; the guard is harmless if it is not.
    """
    if candidate is None:
        return False
    if incumbent is None:
        return True
    return candidate > incumbent


def _backfill_canonical_emails(Submission, User, db_alias):
    """Store the derived `canonical_email` on every Submission that lacks it.

    The owner email is the submitting user's email when there is one,
    otherwise the guest email, otherwise the empty string. Only rows whose
    stored value differs from the derived one are written.
    """
    # Pull all related users up front so we don't do a round-trip per
    # submission.
    user_emails = dict(
        User.objects.using(db_alias).exclude(email="").values_list("pk", "email")
    )
    stale = []
    submissions = Submission.objects.using(db_alias).only(
        "pk", "submitted_by_id", "guest_email", "canonical_email"
    )
    for sub in submissions:
        owner_email = ""
        if sub.submitted_by_id and user_emails.get(sub.submitted_by_id):
            owner_email = user_emails[sub.submitted_by_id]
        elif sub.guest_email:
            owner_email = sub.guest_email
        canonical = _normalize_email(owner_email)
        if canonical != sub.canonical_email:
            sub.canonical_email = canonical
            stale.append(sub)
    if stale:
        Submission.objects.using(db_alias).bulk_update(
            stale, ["canonical_email"], batch_size=500
        )


def _dedupe_verified_emails(VerifiedEmail, db_alias):
    """Collapse VerifiedEmail rows onto one row per normalised address.

    Rows whose email normalises to the empty string are deleted. Among rows
    sharing a normalised form, the one with the most recent `validated_at`
    survives (on a tie, the lowest pk) and the others are deleted. Survivors
    whose stored email is not already normalised are then rewritten.
    """
    table = VerifiedEmail.objects.using(db_alias)

    # First pass: pick the surviving row per normalised form and delete the
    # losers. Ordering by pk makes tie-breaking deterministic.
    survivors = {}  # norm -> (pk, validated_at, stored email)
    for row in table.only("pk", "email", "validated_at").order_by("pk"):
        norm = _normalize_email(row.email)
        if not norm:
            table.filter(pk=row.pk).delete()
            continue
        incumbent = survivors.get(norm)
        if incumbent is None:
            survivors[norm] = (row.pk, row.validated_at, row.email)
        elif _is_newer(row.validated_at, incumbent[1]):
            table.filter(pk=incumbent[0]).delete()
            survivors[norm] = (row.pk, row.validated_at, row.email)
        else:
            table.filter(pk=row.pk).delete()

    # Second pass: rewrite survivors to their normalised form. Safe because
    # every duplicate is already gone; rows that are already normalised are
    # skipped so a re-run issues no writes.
    for norm, (pk, _validated_at, stored) in survivors.items():
        if stored != norm:
            table.filter(pk=pk).update(email=norm)


def forward(apps, schema_editor):
    """Backfill canonical emails and dedupe verified emails.

    Args:
        apps: historical app registry supplied by `RunPython`; used to look
            up the Submission, VerifiedEmail and User models.
        schema_editor: schema editor supplied by `RunPython`; its connection
            alias selects the database every query runs against.
    """
    Submission = apps.get_model("submissions", "Submission")
    VerifiedEmail = apps.get_model("submissions", "VerifiedEmail")
    User = apps.get_model("auth", "User")

    # Run against the database being migrated, not whatever the default
    # router would pick.
    db_alias = schema_editor.connection.alias

    _backfill_canonical_emails(Submission, User, db_alias)
    _dedupe_verified_emails(VerifiedEmail, db_alias)
|
|
|
|
|
|
def reverse(apps, schema_editor):
    """Do nothing when the migration is unapplied.

    The forward backfill has no meaningful inverse: it deletes duplicate
    rows and overwrites emails without recording the previous values. On
    rollback every row is therefore left exactly as it is.
    """
    return None
|
|
|
|
|
|
class Migration(migrations.Migration):
    """Data-only migration that runs the backfill after submissions 0004."""

    dependencies = [("submissions", "0004_email_normalization")]

    operations = [
        # `reverse` is a deliberate no-op, which keeps the migration
        # reversible without touching any rows.
        migrations.RunPython(code=forward, reverse_code=reverse),
    ]
|