Blame data/station-fixups.pl

rpm-build ca2b01
#!/usr/bin/perl

# Filter for ';'-separated weather-station records (one record per line,
# station code in the first field; presumably the lines of nsd_cccc.txt --
# confirm against the build rules).  Each line read via <> is either
# dropped or cleaned up by an ordered list of substitutions and then
# printed to STDOUT.
#
# The order of the substitutions matters: later rules rely on the
# normalisation done by earlier ones, so do not reorder them casually.

use strict;
use warnings;
# The rules below contain a non-ASCII literal ("Hagåtña").  Without this
# pragma Perl treats it as raw bytes, and the :utf8 layer on STDOUT would
# then encode it a second time.
use utf8;

# fixup_line(LINE)
#
# Applies every fixup to one record and returns the corrected line, or
# undef (empty list in list context) if the station is to be dropped.
sub fixup_line {
  local $_ = shift;

  ### Drop certain stations
  return if / Platform *;/; # offshore oil platforms
  return if /^LYPZ;/; # buggy duplicate

  ### Whitespace/punctuation cleanup
  s/   */ /g;
  s/\\/\//g;
  s/([^ ])\//$1 \//g;
  s/\/([^ ])/\/ $1/g;
  s/,([^ ])/, $1/g;
  s/ ,/,/g;
  s/[ ,\/]*;/;/g;
  s/; /;/g;

  ### Capitalization, etc
  s/Mc /Mc/g;
  s/ Of / of /g;
  s/([a-z]) D(a |e |el |es |i |o |u |\')/$1 d$2/g;
  s/([a-z]) L(a|es?) /$1 l$2 /g;
  # lowercasify a capital letter after an apostrophe, unless the
  # preceding letter was "d" (eg, "Cote d'Ivoire")
  s/([a-ce-z]\'[A-Z])/\L$1/g;


  ### Fix incorrect or outdated codes
  if (/;Angelholm;;Sweden;/)   { s/ESDB/ESTA/; }
  if (/;M\. Calamita;;Italy;/) { s/LIRJ/LIRX/; }
  if (/;Yerevan;;Armenia;/)    { s/UGEE/UDYZ/; }
  if (/;Novosibirsk;;Russia;/) { s/UNNN/UNNT/; }
  if (/;Jinan;;China;/)        { s/ZSTN/ZSJN/; }

  ### Fix invalid or incorrect coordinates
  if (/^K3MW;/) { s/;40-26-94N;106-44-95W;/;40-27N;106-45W;/; }
  if (/^KBKB;/) { s/;092-97W;/;093-00W;/; }
  if (/^KBJN;/) { s/;37-37-02;/;37-37-02N;/; }
  if (/^KFHU;/) { s/;46-98N;/;31-35N;/; }
  if (/^KWTR;/) { s/;104-87W;/;105-00W;/; }
  if (/^MMML;/) { s/;117-00W;/;115-14W;/; }
  if (/^MNBL;/) { s/;086-46W;/;083-46W;/; }
  if (/^PGNT;/) { s/;14-96N;/;15-00N;/; }


  ### Fix some country divisions to match FIPS codes
  if (/^EGJA;/) { s/;United Kingdom;/;Guernsey;/; }
  if (/^EGJB;/) { s/;United Kingdom;/;Guernsey;/; }
  if (/^EGJJ;/) { s/;United Kingdom;/;Jersey;/; }
  if (/^EGNS;/) { s/;United Kingdom;/;Isle of Man;/; }
  if (/^EKVG;/) { s/;Denmark;/;Faroe Islands;/; }
  if (/^ENSB;/) { s/;Norway;/;Svalbard;/; }
  if (/^FMCZ;/) { s/;Comoros;/;Mayotte;/; }
  if (/^NLWW;/) { s/;France;/;Wallis and Futuna;/; }
  if (/^PLCH;/) { s/;New Zealand;/;Kiribati;/; }
  if (/^TI..;/) { s/;;Virgin Islands;/;VI;United States;/; }
  if (/^YSNF;/) { s/;Australia;/;Norfolk Island;/; }


  ### Fix some country/state divisions to correct bugs
  if (/^EGYP;/) { s/;South Georgia and the Islands;/;Falkland Islands (Islas Malvinas);/; }
  if (/^HHAS;/) { s/;Ethiopia;/;Eritrea;/; } # 502576
  if (/^LY..;/) { s/;Serbia and Montenegro;/;;/; } # This will force update-locations to figure them out itself
  if (/^NIUE;/) { s/;Cook Islands;/;Niue;/; }
  if (/^NSTU;/) { s/;;United States Minor Outlying Islands;/;AS;United States;/; }
  if (/^NZWD;/) { s/, Antarctic;;New Zealand;/;;Antarctica;/; }
  if (/^PMDY;/) { s/;HI;/;UM;/; }
  if (/^PWAK;/) { s/;GU;/;UM;/; }
  if (/^TKPN;/) { s/;Antigua and Barbuda;/;Saint Kitts and Nevis;/; }
  if (/^YPCC;/) { s/;Christmas Island;/;Cocos (Keeling) Islands;/; }


  ### Fix some spelling mistakes/wackiness/nonstandardnesses. Mostly
  ### alphabetical by station code
  if (/^EDDF;/) { s/ \/ M-Flughafen//; }
  if (/^ETWM;/) { s/-Mil;/ Military Base;/; }
  if (/^FAJS;/) { s/;Johannesburg International Airport;/;O. R. Tambo International Airport;/; } # 533622
  if (/^HESH;/) { s/Sheikhintl/Sheikh Intl/; }
  if (/^HKJK;/) { s/ TWR \/ APP \/ NOF \/ Civil Airlines//; }
  if (/^LBGO;/) { s/Orechovista/Oryakhovitsa/; } # 313655
  if (/^LEJR;/) { s/Fronteraaeropuerto/Frontera Aeropuerto/; }
  if (/^LFPG;/) { s/Paris-Aeroport Charles de Gaulle/Paris, Charles de Gaulle International Airport/; }
  if (/^LIPO;/) { s/Montichia;/Montichiara;/; } # 350945
  if (/^LOAV;/) { s/Lugplatz/Flugplatz/; }
  if (/^MMGL;/) { s/Guadalaj;/Guadalajara;/; }
  if (/^MMMD;/) { s/ lic / /; }
  if (/^MNMG;/) { s/Managua A. C. Sandino/Managua, A. C. Sandino Airport/; }
  if (/^MTPP;/) { s/ \/ Aeroport International/ International Airport/; }
  if (/^MWCR;/) { s/Airportgrand/Airport, Grand/; }
  # 319538 - the entry for OIAG actually has the data for OIAJ
  if (/^OIAG;/) { s/Omidieh/Aghajari/; s/30-46N;049-40E/30-44-44N;049-40-35E/; }
  if (/^OIKB;/) { s/Bandarabbass/Bandar Abbas/; }
  if (/^OINN;/) { s/Noshahr/Now Shahr/; }
  if (/^OITR;/) { s/Orumieh/Orumiyeh/; }
  if (/^PGUM;/) { s/Agana/Hagåtña/; } # to match POP_PLACES
  if (/^SABE;/) { s/Aeroparque Bs\. As\./Buenos Aires, Jorge Newbery/; }
  if (/^SBFZ;/) { s/pinto/Pinto/; }
  if (/^SPHY;/) { s/Andahuayla/Andahuaylas/; }
  if (/^SPIM;/) { s/Aerop\. Internacional Jorgechavez/Jorge Chavez International Airport/; }
  if (/^SVMI;/) { s/Maiquetia Aerop\. Intl\. Simon Bolivar/Simon Bolivar International/; }
  if (/^TAPA;/) { s/Vc /V. C. /; }
  if (/^TKPN;/) { s/Newcast;/Newcastle;/; }
  if (/^UBBG;/) { s/Gyanca/Gyandzha/; }
  if (/^UKDR;/) { s/Krivyy/Kryvyy/; }


  ### "Move" some stations to keep them from matching irrelevant cities
  if (/^VTBD;/) { s/Don Muang/Bangkok/; }


  ### Untranslate/unabbreviate the word "Airport". (The names in
  ### nsd_cccc.txt don't seem to be especially close to
  ### correct/official, so this is a net win.)
  s/Aerop\. Internacional ([^,;]*)/$1 International Airport/;
  s/Aeropuerto[^ ]* ([^,;]*)/$1 Airport/;
  s/Aeroporto* d[ea] ([^,;]*)/$1 Airport/;
  s/[ -]Aero(|\.|drome|porto?|-Porto|puerto)( |;)/ Airport$2/;
  s/Air(-Port|p\.)/Airport/;
  s/Civ \/ (Mil|Afb)/Airport/;
  # The first group is written "( \/|)" rather than "( \/)?" so that $1 is
  # an empty string (not undef) when there is no " /" before " Civ".
  s/( \/|) Civ(|il|ilian);/$1 Airport;/;
  s/Lufthavn/Airport/;
  s/Int\'?l\.?/International/;
  s/Int\./International/;
  s/Inter-National/International/;
  s/Internationalairport;/International Airport;/;
  s/International;/International Airport;/;
  s/Airport ([A-Z])/Airport, $1/;
  # Change "Foo / Airport" to "Foo Airport"
  s/;([^;]*)(,| \/) (International Airport|Airport);/;$1 $3;/;
  # And "Foo / Bar Airport" to "Foo, Bar Airport"
  s/;([^;\/,]*) \/ ([^;\/,]* Airport)/;$1, $2/;

  s/,? ([a-z][a-z]*-)?afb/ Air Force Base/i;
  s/ ([A-Z][a-z]*-)?Ab;/ Air Base;/;
  s/Usa . Af/US Air Force Base/;
  s/Usaf/US Air Force Base/;
  s/Air Force Operated Base In Foreign Country/Air Force Base/;
  s/ (Can-)?Mil(\.|itary);/ Military Base;/;

  s/Obs(\.|erv\.|ervatory|ervatorio)/Observatory/;

  # US National Weather Service, but appears not just in /;United States;/
  s/, NWS Office//;
  # Likewise Australian Weather Service (or Automated Weather Station?)
  s/,? Aws;/;/;

  ### Country-specific fixups, sorted alphabetically by country

  if (/;Argentina;/) {
    # Remove province name from location description
    s/, (BA|B\. A\.|CHT|SF);/;/;
  }

  if (/;Australia;/) {
    # ??
    s/ (Amo|Mo);/ Airport;/;
    s/,? M\. O\.?;/;/;

    s/ Ran / Royal Australian Navy /;
  }

  if (/;Austria;/) {
    s/-Flughafen/ Airport/;
    s/Flugplatz/Airport/;
    s/ Am / am /;
    s/ Im / im /;
  }

  if (/;Canada;/) {
    # Remove province/territory name from location description
    s/,? (Alta|B\. C|Man|N\. B|Nfld|N\. S|N\. W\. T|Ont|P\. E\. I|Prince Edward Island|Que|Sask|Y\. T)\.?;/;/;

    # Canadian Department of Agriculture, [Remote] Climate Station
    s/ Cda//i;
    s/ R?CS//;

    s/Airport, [^;]*Station;/Airport;/;

    # /CX../ stations are automated. Maybe we should drop all of them,
    # but for now we'll just drop the ones where there's also a
    # corresponding non-automated station
    if (/^CX(DE|EC|EG|MI|MM|OX|TV|WN|ZU)/) {
      return;
    }
  }

  if (/;Cuba;/) {
    s/, Oriente//;
  }

  if (/;Mexico;/) {
    # Remove state name from location description
    s/,? (Ags|B\. C\. S|Camp|Chis|Coah|Mor|N\. L|Nay|Pue|Q\. Roo|Qro|S\. L\. P|Sin|Son)\.?;/;/;
  }

  if (/;Netherlands;/) {
    s/;([^;]*) Airport, ([^;]*);/;$1, $2 Airport;/;
  }

  if (/;New Caledonia;/) {
    s/ Nlle-Caledonie//;
    s/ Ile [^;]*//;
  }

  if (/;Sweden;/) {
    s/Flygplats/Airport/;
  }

  if (/;United States;/) {
    s/Nexrad/NEXRAD Station/;
  }

  ### Final airport fixing...

  ### The location data for several countries (including all of South
  ### America) uses the convention "City / Airport Name"
  if (/^(AG|AY|DA|EL|FC|FMM|FO|FX|FZ|GA|GM|HE|LI|LJ|MK|MM|MP|OI|PL|S|US|UU|UW|VV|WA|WI|WM|WR|WS|Z)/) {
    s/;([^;]*) \/ ([^;\/]*)/;$1, $2 Airport/;
  }

  ### Some do it backwards
  if (/^(EN|LE|MS|TT)/) {
    s/;([^;]*) \/ ([^;\/]*)/;$1 Airport, $2/;
  }

  ### In some countries, you generally need to prefix the city name to
  ### the airport name
  if (/^(C|ES|GO|LF|LH|LK|LS|LT|LZ)/) {
    s/;([^;]*) \/ ([^;\/]*)/;$1, $1-$2 Airport/;
  }

  # Some of our fixes end up resulting in "Airport Airport"
  s/Airport Airport/Airport/;

  # Remove numbers in "Foo 1", "Foo 2", "Foo Iii", etc
  s/ [123];/;/;
  s/ I[iv]i*([^a-z])/$1/;

  return $_;
}

# main()
#
# Reads records via <> and prints every record that fixup_line() keeps.
sub main {
  # NOTE: the input layer is set on STDIN only.  Files named on the command
  # line are read through ARGV and do not get it, so feed the data on
  # standard input (as the binmode call suggests was intended).
  binmode STDIN, ":utf8";
  binmode STDOUT, ":utf8";

  while (defined(my $line = <>)) {
    my $fixed = fixup_line($line);
    print $fixed if defined $fixed;
  }
  return;
}

# Run the filter only when executed as a script, so that the file can also
# be loaded (e.g. by a test) just to get at fixup_line().
main() unless caller;

1;