add the script for country name validation

This commit is contained in:
2026-03-11 17:15:34 +01:00
parent 20802fd8af
commit cbe1fa85b6

View File

@@ -0,0 +1,120 @@
#!/usr/bin/env python3
import csv
import re
import pycountry
# Hardcoded country/territory display names that main() audits against
# pycountry's ISO data.
# NOTE(review): 'Congo', 'Korea' and 'Virgin Islands' each appear twice, so a
# label alone cannot tell the two underlying entries apart; several labels
# (these three, 'Cocos', ...) look like shortened forms of longer ISO names --
# confirm against wherever this list was copied from.
# planned_migration_code() pins 'Korea', 'Virgin Islands', 'Sint Maarten' and
# 'Saint Martin' to fixed codes instead of relying on name lookup.
COUNTRY_LIST = ['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia', 'Bonaire, Sint Eustatius and Saba', 'Bosnia and Herzegovina', 'Botswana', 'Bouvet Island', 'Brazil', 'British Indian Ocean Territory', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'China', 'Christmas Island', 'Cocos', 'Colombia', 'Comoros', 'Congo', 'Congo', 'Cook Islands', 'Costa Rica', 'Croatia', 'Cuba', 'Curaçao', 'Cyprus', 'Czechia', "Côte d'Ivoire", 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Falkland Islands', 'Faroe Islands', 'Fiji', 'Finland', 'France', 'French Guiana', 'French Polynesia', 'French Southern Territories', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Gibraltar', 'Greece', 'Greenland', 'Grenada', 'Guadeloupe', 'Guam', 'Guatemala', 'Guernsey', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Heard Island and McDonald Islands', 'Holy See', 'Honduras', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Isle of Man', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jersey', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', 'Korea', 'Korea', 'Kuwait', 'Kyrgyzstan', "Lao People's Democratic Republic", 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macao', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Marshall Islands', 'Martinique', 'Mauritania', 'Mauritius', 'Mayotte', 'Mexico', 'Micronesia', 'Moldova', 'Monaco', 'Mongolia', 'Montenegro', 'Montserrat', 'Morocco', 'Mozambique', 'Myanmar', 
'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Caledonia', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Niue', 'Norfolk Island', 'Northern Mariana Islands', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Palestine, State of', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Pitcairn', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar', 'Republic of North Macedonia', 'Romania', 'Russian Federation', 'Rwanda', 'Réunion', 'Saint Barthélemy', 'Saint Helena, Ascension and Tristan da Cunha', 'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Martin', 'Saint Pierre and Miquelon', 'Saint Vincent and the Grenadines', 'Samoa', 'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore', 'Sint Maarten', 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 'South Georgia and the South Sandwich Islands', 'South Sudan', 'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'Svalbard and Jan Mayen', 'Sweden', 'Switzerland', 'Syrian Arab Republic', 'Taiwan', 'Tajikistan', 'Tanzania, United Republic of', 'Thailand', 'Timor-Leste', 'Togo', 'Tokelau', 'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Turkmenistan', 'Turks and Caicos Islands', 'Tuvalu', 'Uganda', 'Ukraine', 'United Arab Emirates', 'United Kingdom of Great Britain and Northern Ireland', 'United States Minor Outlying Islands', 'United States of America', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela', 'Viet Nam', 'Virgin Islands', 'Virgin Islands', 'Wallis and Futuna', 'Western Sahara', 'Yemen', 'Zambia', 'Zimbabwe', 'Åland Islands']
# Exactly two ASCII uppercase letters; compiled once because is_alpha2() is
# called for every label and every pycountry entry.
_ALPHA2_PATTERN = re.compile(r"[A-Z]{2}")


def is_alpha2(code: str) -> bool:
    """Return True if *code* has the shape of an alpha-2 country code.

    This is a purely syntactic check (two ASCII uppercase letters, nothing
    before or after); it does not verify that the code is actually assigned
    to a country.

    Args:
        code: Candidate code. Non-string values are treated as "not a code"
            and yield False instead of raising.

    Returns:
        True when *code* is a string of exactly two letters in ``A``-``Z``.
    """
    return isinstance(code, str) and _ALPHA2_PATTERN.fullmatch(code) is not None
def get_country_iso_code(country_name: str) -> str | None:
    """Return the 2-letter ISO representation for a country (backend logic).

    The lookup order is kept exactly as in the logic this script audits:

    1. exact match on the pycountry ``name`` field,
    2. exact match on the ``official_name`` field,
    3. first result of ``pycountry.countries.search_fuzzy``.

    Args:
        country_name: Country label to resolve.

    Returns:
        The ``alpha_2`` code of the matched country, or None when no lookup
        stage finds a match.
    """
    countries = pycountry.countries
    try:
        country = countries.get(name=country_name) or countries.get(
            official_name=country_name
        )
        if country:
            return country.alpha_2
        matches = countries.search_fuzzy(country_name)
    except LookupError:
        # search_fuzzy signals "no match" with LookupError (KeyError from a
        # failed exact lookup is a subclass). Anything else is a real failure
        # -- e.g. the pycountry data not loading -- and must surface instead
        # of silently turning every label into "unmatched" in the report.
        return None
    return matches[0].alpha_2 if matches else None
def planned_migration_code(label: str) -> str | None:
if label == "Korea":
return "KR"
if label == "Virgin Islands":
return "VI"
if label == "Sint Maarten":
return "SX"
if label == "Saint Martin":
return "MF"
return get_country_iso_code(label)
_DEFAULT_OUT_PATH = "countries_iso_vs_hardcoded_planned_migration_v5_utf8_bom.csv"

# Header row of the report; order must match the tuples built in main().
_REPORT_COLUMNS = (
    "alpha2",
    "planned_migration",
    "iso_short_name_canonical_en",
    "how_backend_logic_maps_hardcoded_names_to_country_codes",
    "hardcoded_list_contains_iso_canonical",
    "hard_coded_name_maps_to_code",
    "iso_canonical_maps_to_same_code_via_backend_logic",
)


def _backend_labels_by_code() -> dict[str, str]:
    """Map each code to the "; "-joined distinct hardcoded labels that
    get_country_iso_code() resolves to it (report column D)."""
    grouped: dict[str, list[str]] = {}
    for label in COUNTRY_LIST:
        code = get_country_iso_code(label)
        if not code or not is_alpha2(code):
            continue
        labels = grouped.setdefault(code, [])
        # COUNTRY_LIST contains repeated labels; list each one once per code.
        if label not in labels:
            labels.append(label)
    # Join once at the end: labels are never re-parsed from the joined
    # string, so a label containing ';' cannot be split apart.
    return {code: "; ".join(labels) for code, labels in grouped.items()}


def _planned_labels() -> tuple[dict[str, str], list[str]]:
    """Return (first label per planned code, distinct labels without a
    usable planned code, in first-seen order)."""
    label_by_code: dict[str, str] = {}
    unmatched: list[str] = []
    for label in COUNTRY_LIST:
        code = planned_migration_code(label)
        if code and is_alpha2(code):
            # First label wins when several labels share a code.
            label_by_code.setdefault(code, label)
        else:
            unmatched.append(label)
    return label_by_code, list(dict.fromkeys(unmatched))


def main(out_path: str = _DEFAULT_OUT_PATH) -> None:
    """Write a CSV comparing pycountry's entries with COUNTRY_LIST.

    One row is written per pycountry country that has a well-formed alpha-2
    code, with these columns:

    * the alpha-2 code,
    * the hardcoded label planned for that code (first one wins),
    * pycountry's ``name`` for the country,
    * the hardcoded labels that get_country_iso_code() maps to the code,
    * Y/N: is pycountry's name itself in COUNTRY_LIST,
    * Y/N: does any hardcoded label map to the code,
    * Y/N: does pycountry's name map back to the same code.

    Hardcoded labels with no usable planned code follow, one row each, with
    an empty code column. Rows are sorted by code, with the code-less rows
    last.

    Args:
        out_path: Destination CSV path; defaults to the original fixed
            file name in the current working directory.
    """
    backend_labels = _backend_labels_by_code()
    planned_labels, planned_unmatched = _planned_labels()
    hardcoded_names = set(COUNTRY_LIST)

    rows = []
    for country in pycountry.countries:
        code = getattr(country, "alpha_2", None)
        if not code or not is_alpha2(code):
            continue
        iso_name = country.name
        backend_maps_to = backend_labels.get(code, "")
        rows.append((
            code,
            planned_labels.get(code, ""),
            iso_name,
            backend_maps_to,
            "Y" if iso_name in hardcoded_names else "N",
            "Y" if backend_maps_to else "N",
            # Does the canonical ISO name map back to the same code via the
            # backend logic?
            "Y" if get_country_iso_code(iso_name) == code else "N",
        ))

    rows.extend(("", label, "", "", "", "", "") for label in planned_unmatched)
    # Rows with a code first (alphabetical); code-less rows keep their
    # insertion order at the end because the sort is stable.
    rows.sort(key=lambda row: (row[0] == "", row[0]))

    # newline="" is what the csv module requires of the file object;
    # "utf-8-sig" prepends a BOM, as the default file name advertises.
    with open(out_path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(_REPORT_COLUMNS)
        writer.writerows(rows)
    print(f"Wrote {len(rows)} rows to {out_path}")
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()