"""One-shot data backfill for the schema change in 0004: 1. Populate `Submission.canonical_email` for every existing row by deriving it from `submitted_by.email` (OAuth) or `guest_email` (guest) and running through the same normaliser the live `save()` uses. 2. Re-normalise every `VerifiedEmail.email` already in the table. Rows that collapse to the same canonical form are deduped: we keep the row with the most recent `validated_at` and delete the others. Defensive: both passes use `update_fields=` and `update_or_create`-style logic so re-running the migration is a no-op once it's been applied. """ from django.db import migrations def _normalize_email(email): if not email or "@" not in email: return (email or "").lower() local, _, domain = email.lower().rpartition("@") if "+" in local: local = local.split("+", 1)[0] return f"{local}@{domain}" def forward(apps, schema_editor): Submission = apps.get_model("submissions", "Submission") VerifiedEmail = apps.get_model("submissions", "VerifiedEmail") User = apps.get_model("auth", "User") # ---- Submission.canonical_email ----------------------------------------- # Pull all related users up front so we don't do an O(N) round-trip # per submission. user_emails = dict( User.objects.exclude(email="").values_list("pk", "email") ) to_update = [] for sub in Submission.objects.all().only( "pk", "submitted_by_id", "guest_email", "canonical_email" ): owner_email = "" if sub.submitted_by_id and user_emails.get(sub.submitted_by_id): owner_email = user_emails[sub.submitted_by_id] elif sub.guest_email: owner_email = sub.guest_email new = _normalize_email(owner_email) if new != sub.canonical_email: sub.canonical_email = new to_update.append(sub) if to_update: Submission.objects.bulk_update(to_update, ["canonical_email"], batch_size=500) # ---- VerifiedEmail re-normalisation + dedup ---------------------------- # First pass: pick the surviving row per normalised form (most recent # validated_at wins). Delete the losers. survivors: dict[str, tuple[int, object]] = {} # norm -> (pk, validated_at) for row in VerifiedEmail.objects.all().only("pk", "email", "validated_at"): norm = _normalize_email(row.email) if not norm: row.delete() continue prev = survivors.get(norm) if prev is None: survivors[norm] = (row.pk, row.validated_at) else: prev_pk, prev_at = prev if row.validated_at > prev_at: VerifiedEmail.objects.filter(pk=prev_pk).delete() survivors[norm] = (row.pk, row.validated_at) else: row.delete() # Second pass: rewrite the surviving row's email to its normalised form # (no-op when already normalised; safe because all duplicates are gone). for norm, (pk, _at) in survivors.items(): VerifiedEmail.objects.filter(pk=pk).update(email=norm) def reverse(apps, schema_editor): """The forward pass is a derived backfill; there's nothing meaningful to undo. Leaving rows alone is the right thing on rollback.""" class Migration(migrations.Migration): dependencies = [ ("submissions", "0004_email_normalization"), ] operations = [ migrations.RunPython(forward, reverse), ]