Blob Blame History Raw
#!/usr/bin/perl

binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";

while (<>) {
  ### Drop certain stations
  next if / Platform *;/; # offshort oil platforms
  next if /^LYPZ;/; # buggy duplicate

  ### Whitespace/punctuation cleanup
  s/   */ /g;
  s/\\/\//g;
  s/([^ ])\//$1 \//g;
  s/\/([^ ])/\/ $1/g;
  s/,([^ ])/, $1/g;
  s/ ,/,/g;
  s/[ ,\/]*;/;/g;
  s/; /;/g;

  ### Capitalization, etc
  s/Mc /Mc/g;
  s/ Of / of /g;
  s/([a-z]) D(a |e |el |es |i |o |u |\')/$1 d$2/g;
  s/([a-z]) L(a|es?) /$1 l$2 /g;
  # lowercasify a capital letter after an apostrophe, unless the
  # preceding letter was "d" (eg, "Cote d'Ivoire")
  s/([a-ce-z]\'[A-Z])/\L$1/g; 


  ### Fix incorrect or outdated codes
  if (/;Angelholm;;Sweden;/)   { s/ESDB/ESTA/; }
  if (/;M\. Calamita;;Italy;/) { s/LIRJ/LIRX/; }
  if (/;Yerevan;;Armenia;/)    { s/UGEE/UDYZ/; }
  if (/;Novosibirsk;;Russia;/) { s/UNNN/UNNT/; }
  if (/;Jinan;;China;/)        { s/ZSTN/ZSJN/; }

  ### Fix invalid or incorrect coordinates
  if (/^K3MW;/) { s/;40-26-94N;106-44-95W;/;40-27N;106-45W;/; }
  if (/^KBKB;/) { s/;092-97W;/;093-00W;/; }
  if (/^KBJN;/) { s/;37-37-02;/;37-37-02N;/; }
  if (/^KFHU;/) { s/;46-98N;/;31-35N;/; }
  if (/^KWTR;/) { s/;104-87W;/;105-00W;/; }
  if (/^MMML;/) { s/;117-00W;/;115-14W;/; }
  if (/^MNBL;/) { s/;086-46W;/;083-46W;/; }
  if (/^PGNT;/) { s/;14-96N;/;15-00N;/; }


  ### Fix some country divisions to match FIPS codes
  if (/^EGJA;/) { s/;United Kingdom;/;Guernsey;/; }
  if (/^EGJB;/) { s/;United Kingdom;/;Guernsey;/; }
  if (/^EGJJ;/) { s/;United Kingdom;/;Jersey;/; }
  if (/^EGNS;/) { s/;United Kingdom;/;Isle of Man;/; }
  if (/^EKVG;/) { s/;Denmark;/;Faroe Islands;/; }
  if (/^ENSB;/) { s/;Norway;/;Svalbard;/; }
  if (/^FMCZ;/) { s/;Comoros;/;Mayotte;/; }
  if (/^NLWW;/) { s/;France;/;Wallis and Futuna;/; }
  if (/^PLCH;/) { s/;New Zealand;/;Kiribati;/; }
  if (/^TI..;/) { s/;;Virgin Islands;/;VI;United States;/; }
  if (/^YSNF;/) { s/;Australia;/;Norfolk Island;/; }


  ### Fix some country/state divisions to correct bugs
  if (/^EGYP;/) { s/;South Georgia and the Islands;/;Falkland Islands (Islas Malvinas);/; }
  if (/^HHAS;/) { s/;Ethiopia;/;Eritrea;/; } # 502576
  if (/^LY..;/) { s/;Serbia and Montenegro;/;;/; } # This will force update-locations to figure them out itself
  if (/^NIUE;/) { s/;Cook Islands;/;Niue;/; }
  if (/^NSTU;/) { s/;;United States Minor Outlying Islands;/;AS;United States;/; }
  if (/^NZWD;/) { s/, Antarctic;;New Zealand;/;;Antarctica;/; }
  if (/^PMDY;/) { s/;HI;/;UM;/; }
  if (/^PWAK;/) { s/;GU;/;UM;/; }
  if (/^TKPN;/) { s/;Antigua and Barbuda;/;Saint Kitts and Nevis;/; }
  if (/^YPCC;/) { s/;Christmas Island;/;Cocos (Keeling) Islands;/; }


  ### Fix some spelling mistakes/wackiness/nonstandardnesses. Mostly
  ### alphabetical by station code
  if (/^EDDF;/) { s/ \/ M-Flughafen//; }
  if (/^ETWM;/) { s/-Mil;/ Military Base;/; }
  if (/^FAJS;/) { s/;Johannesburg International Airport;/;O. R. Tambo International Airport;/; } # 533622
  if (/^HESH;/) { s/Sheikhintl/Sheikh Intl/; }
  if (/^HKJK;/) { s/ TWR \/ APP \/ NOF \/ Civil Airlines//; }
  if (/^LBGO;/) { s/Orechovista/Oryakhovitsa/; } # 313655
  if (/^LEJR;/) { s/Fronteraaeropuerto/Frontera Aeropuerto/; }
  if (/^LFPG;/) { s/Paris-Aeroport Charles de Gaulle/Paris, Charles de Gaulle International Airport/; }
  if (/^LIPO;/) { s/Montichia;/Montichiara;/; } # 350945
  if (/^LOAV;/) { s/Lugplatz/Flugplatz/; }
  if (/^MMGL;/) { s/Guadalaj;/Guadalajara;/; }
  if (/^MMMD;/) { s/ lic / /; }
  if (/^MNMG;/) { s/Managua A. C. Sandino/Managua, A. C. Sandino Airport/; }
  if (/^MTPP;/) { s/ \/ Aeroport International/ International Airport/; }
  if (/^MWCR;/) { s/Airportgrand/Airport, Grand/; }
  # 319538 - the entry for OIAG actually has the data for OIAJ
  if (/^OIAG;/) { s/Omidieh/Aghajari/; s/30-46N;049-40E/30-44-44N;049-40-35E/; }
  if (/^OIKB;/) { s/Bandarabbass/Bandar Abbas/; }
  if (/^OINN;/) { s/Noshahr/Now Shahr/; }
  if (/^OITR;/) { s/Orumieh/Orumiyeh/; }
  if (/^PGUM;/) { s/Agana/Hagåtña/; } # to match POP_PLACES
  if (/^SABE;/) { s/Aeroparque Bs\. As\./Buenos Aires, Jorge Newbery/; }
  if (/^SBFZ;/) { s/pinto/Pinto/; }
  if (/^SPHY;/) { s/Andahuayla/Andahuaylas/; }
  if (/^SPIM;/) { s/Aerop\. Internacional Jorgechavez/Jorge Chavez International Airport/; }
  if (/^SVMI;/) { s/Maiquetia Aerop\. Intl\. Simon Bolivar/Simon Bolivar International/; }
  if (/^TAPA;/) { s/Vc /V. C. /; }
  if (/^TKPN;/) { s/Newcast;/Newcastle;/; }
  if (/^UBBG;/) { s/Gyanca/Gyandzha/; }
  if (/^UKDR;/) { s/Krivyy/Kryvyy/; }


  ### "Move" some stations to keep them from matching irrelevant cities
  if (/^VTBD;/) { s/Don Muang/Bangkok/; }


  ### Untranslate/unabbreviate the word "Airport". (The names in
  ### nsd_cccc.txt don't seem to be especially close to
  ### correct/official, so this is a net win.
  s/Aerop\. Internacional ([^,;]*)/$1 International Airport/;
  s/Aeropuerto[^ ]* ([^,;]*)/$1 Airport/;
  s/Aeroporto* d[ea] ([^,;]*)/$1 Airport/;
  s/[ -]Aero(|\.|drome|porto?|-Porto|puerto)( |;)/ Airport$2/;
  s/Air(-Port|p\.)/Airport/;
  s/Civ \/ (Mil|Afb)/Airport/;
  s/( \/)? Civ(|il|ilian);/$1 Airport;/;
  s/Lufthavn/Airport/;
  s/Int\'?l\.?/International/;
  s/Int\./International/;
  s/Inter-National/International/;
  s/Internationalairport;/International Airport;/;
  s/International;/International Airport;/;
  s/Airport ([A-Z])/Airport, $1/;
  # Change "Foo / Airport" to "Foo Airport"
  s/;([^;]*)(,| \/) (International Airport|Airport);/;$1 $3;/;
  # And "Foo / Bar Airport" to "Foo, Bar Airport"
  s/;([^;\/,]*) \/ ([^;\/,]* Airport)/;$1, $2/;

  s/,? ([a-z][a-z]*-)?afb/ Air Force Base/i;
  s/ ([A-Z][a-z]*-)?Ab;/ Air Base;/;
  s/Usa . Af/US Air Force Base/;
  s/Usaf/US Air Force Base/;
  s/Air Force Operated Base In Foreign Country/Air Force Base/;
  s/ (Can-)?Mil(\.|itary);/ Military Base;/;

  s/Obs(\.|erv\.|ervatory|ervatorio)/Observatory/;

  # US National Weather Service, but appears not just in /;United States;/
  s/, NWS Office//;
  # Likewise Australian Weather Service (or Automated Weather Station?)
  s/,? Aws;/;/;

  ### Country-specific fixups, sorted alphabetically by country

  if (/;Argentina;/) {
    # Remove province name from location description
    s/, (BA|B\. A\.|CHT|SF);/;/;
  }

  if (/;Australia;/) {
    # ??
    s/ (Amo|Mo);/ Airport;/;
    s/,? M\. O\.?;/;/;

    s/ Ran / Royal Australian Navy /;
  }

  if (/;Austria;/) {
    s/-Flughafen/ Airport/;
    s/Flugplatz/Airport/;
    s/ Am / am /;
    s/ Im / im /;
  }

  if (/;Canada;/) {
    # Remove province/territory name from location description
    s/,? (Alta|B\. C|Man|N\. B|Nfld|N\. S|N\. W\. T|Ont|P\. E\. I|Prince Edward Island|Que|Sask|Y\. T)\.?;/;/;

    # Canadian Department of Agriculture, [Remote] Climate Station
    s/ Cda//i;
    s/ R?CS//;

    s/Airport, [^;]*Station;/Airport;/;

    # /CX../ stations are automated. Maybe we should drop all of them,
    # but for now we'll just drop the ones where there's also a
    # corresponding non-automated station
    if (/^CX(DE|EC|EG|MI|MM|OX|TV|WN|ZU)/) {
      next;
    }
  }

  if (/;Cuba;/) {
    s/, Oriente//;
  }

  if (/;Mexico;/) {
    # Remove state name from location description
    s/,? (Ags|B\. C\. S|Camp|Chis|Coah|Mor|N\. L|Nay|Pue|Q\. Roo|Qro|S\. L\. P|Sin|Son)\.?;/;/;
  }

  if (/;Netherlands;/) {
    s/;([^;]*) Airport, ([^;]*);/;$1, $2 Airport;/;
  }

  if (/;New Caledonia;/) {
    s/ Nlle-Caledonie//;
    s/ Ile [^;]*//;
  }

  if (/;Sweden;/) {
    s/Flygplats/Airport/;
  }

  if (/;United States;/) {
    s/Nexrad/NEXRAD Station/;
  }

  ### Final airport fixing...

  ### The location data for several countries (including all of South
  ### America) uses the convention "City / Airport Name"
  if (/^(AG|AY|DA|EL|FC|FMM|FO|FX|FZ|GA|GM|HE|LI|LJ|MK|MM|MP|OI|PL|S|US|UU|UW|VV|WA|WI|WM|WR|WS|Z)/) {
    s/;([^;]*) \/ ([^;\/]*)/;$1, $2 Airport/;
  }

  ### Some do it backwards
  if (/^(EN|LE|MS|TT)/) {
    s/;([^;]*) \/ ([^;\/]*)/;$1 Airport, $2/;
  }

  ### In some countries, you generally need to prefix the city name to
  ### the airport name
  if (/^(C|ES|GO|LF|LH|LK|LS|LT|LZ)/) {
    s/;([^;]*) \/ ([^;\/]*)/;$1, $1-$2 Airport/;
  }

  # Some of our fixes end up resulting in "Airport Airport"
  s/Airport Airport/Airport/;

  # Remove numbers in "Foo 1", "Foo 2", "Foo Iii", etc
  s/ [123];/;/;
  s/ I[iv]i*([^a-z])/$1/;

  print;
}