Blob Blame History Raw
#!/usr/bin/perl -w

use File::Temp "tempfile";
use Text::Unidecode;

sub myunidecode {
  my ($text) = @_;
  $text = unidecode($text);
  $text =~ s/\@/a/g;
  return $text;
}

@intlglob = glob("sources/geonames_dd_dms_date_*.txt");
$intlsrc = $intlglob[$#intlglob];

$uscitysrc = "sources/POP_PLACES.txt";
$usbigsrc = "sources/US_CONCISE.txt";
$majorsrc = "major-cities.txt";
$weathersrc = "sources/nsd_cccc.txt";

$locationdb = "locationdb.sqlite";
unlink $locationdb;

($states, $states_filename) = tempfile();
binmode $states, ":utf8";
($counties, $counties_filename) = tempfile();
binmode $counties, ":utf8";
($cities, $cities_filename) = tempfile();
binmode $cities, ":utf8";

# The columns in $intlsrc: (interesting ones in caps)
# RC UFI uni LAT LONG dms_lat dms_long mgrs jog fc DSG PC CC1 ADM1 adm2 POP ELEV CC2 NT LC SHORT_FORM generic SORT_NAME FULL_NAME FULL_NAME_ND modify_date

print "Reading $intlsrc\n";
open INTLSRC, "-|:utf8", "perl ./city-fixups.pl < $intlsrc";
while (<INTLSRC>) {
  # skip header, and historical records
  next if $. == 1 || /\(\(/ || /\(historical\)/i;

  @fields = split /\t/;
  next if $#fields != 25;

  ($rc, $id, $junk, $latitude, $longitude, $junk, $junk, $junk,
   $junk, $junk, $designation, $pc, $country_code, $state_code, $junk,
   $population, $elevation, $cc2, $name_type, $language, $short_name,
   $junk, $flat_name, $long_name, $long_name_nd, $junk) = @fields;

  # Don't use wacky complex transliteration rules for non-European
  # names; just use the non-diacritic version of the name instead
  if ($name_type eq 'N' && (!$rc || $rc > 2)) {
    $long_name = myunidecode($long_name);
    $short_name = myunidecode($short_name);
  } elsif ($name_type eq 'NS' && (!$rc || $rc > 2)) {
    $long_name = $long_name_nd;
  }

  # Also fix smart quotes and dumb spacing
  $short_name =~ s/\x{2019}/'/g;
  $short_name =~ s/  +/ /;
  $short_name =~ s/ $//;
  $long_name =~ s/\x{2019}/'/g;
  $long_name =~ s/  +/ /;
  $long_name =~ s/ $//;

  if ($designation =~ /^PCL/) {
    # /^PCL/ matches countries and other ISO-3166-like things:
    # PCLD - dependent political entity
    # PCLF - freely associated state
    # PCLI - independent political entity (ie, "country")
    # PCLIX - section of independent political entity

    if ($short_name) { $countries{$short_name} = $country_code; }
    if ($long_name) { $countries{$long_name} = $country_code; }

  } elsif ($designation eq "ADM1") {
    # ID COUNTRY_CODE STATE_CODE NAME_TYPE LANGUAGE SHORT_NAME
    # LONG_NAME FLAT_NAME
    print $states "intl-$id\t$country_code\t$country_code$state_code\t$name_type\t$language\t$short_name\t$long_name\t$flat_name\n";

  } elsif ($designation =~ /^PPL/) {
    # "PPL*" is "Populated Place"

    # Assign an "importance" to the city. Having a positive
    # $importance makes a city "bigger" in terms of figuring out how
    # close it is to a given coordinate, and having negative
    # $importance makes it "smaller". Furthermore, a negative
    # importance city can never match solely on coordinates; it needs
    # to match based on name as well.
    #
    # $importance = 2 means furthermore that if we haven't found a
    # weather station for the city after going through the whole
    # stations table, then we'll add an entry using whatever weather
    # station is closest. $importance = 3 means it's a national
    # capital.
    #
    # Our current definition of "important" is that it has designation
    # of "PPLC" (capital of a country [that is, a /PCL.*/]) or "PPLA"
    # (capital of an ADM1). We also look at the "Populated Place
    # Classification" ($pc) and population, but as of 2008-05 those
    # aren't filled in yet for most countries.
    #
    # If $pc is 4 or 5 ("unimportant"), or $designation is "PPLX"
    # (neighborhood/subdivision), we give the city $importance = -1,
    # meaning it will only be used when a weather station actually
    # matches its name; it won't get pulled in as a "nearest city"
    # match.
    if ($designation eq "PPLC") {
      $importance = 3;
    } elsif ($population && $population > 100000) {
      $importance = 2;
    } elsif ($designation eq "PPLA" || $pc eq "1" || $pc eq "2") {
      $importance = 1;
    } elsif ($designation eq "PPLX" || $pc eq "4" || $pc eq "5") {
      $importance = -1;
    } else {
      $importance = 0;
    }

    # either sqlite or pysqlite treats ".9" as a string rather than a
    # number, so make sure we say "0.9" instead
    $latitude = 0 + $latitude;
    $longitude = 0 + $longitude;

    # ID LATITUDE LONGITUDE ELEVATION IMPORTANCE COUNTRY_CODE
    # STATE_CODE COUNTY_CODE NAME_TYPE LANGUAGE SHORT_NAME LONG_NAME
    # FLAT_NAME
    print $cities "intl-$id\t$latitude\t$longitude\t$elevation\t$importance\t$country_code\t$country_code$state_code\t\t$name_type\t$language\t$short_name\t$long_name\t$flat_name\n";

    # The dataset gives two countries for some features, usually
    # meaning either that they cross a national border, or are
    # disputed.
    if ($cc2) {
      print $cities "intl-$id\t$latitude\t$longitude\t$elevation\t$importance\t$cc2\t\t\t$name_type\t$language\t$short_name\t$long_name\t$flat_name\n";
    }
  }
}
close INTLSRC;

# $usbigsrc is the "Concise Features Gazetteer", containing "Large
# features that should be labeled on maps with a scale of 1:250,000".
# We use this to find medium-to-large cities. After reading through
# that, we read $uscitysrc, the "Populated Places Gazetteer", and add
# any remaining cities from there with $importance=-1. Both of these
# files have DOS newlines. (After importing the data into sqlite,
# we'll use $majorsrc to set $importance=2 on the largest US cities.)
%uscities = ();
$/ = "\r\n";

# The columns in $usbigsrc:
# feature_id feature_name class st_alpha st_num county county_num primary_lat_dms primary_lon_dms primary_lat_dec primary_lon_dec source_lat_dms source_lon_dms source_lat_dec source_lon_dec elev map_name

print "Reading $usbigsrc\n";
open USBIGSRC, "<:utf8", $usbigsrc;
while (<USBIGSRC>) {
  # skip header, and historical records
  next if $. == 1 || /\(historical\)/i;

  # remove CRLF and split
  chomp;
  @fields = split /\t/;
  next if $#fields != 16;
  ($feature_id, $name, $class, $state, $state_num, $junk, $county_num, $junk,
   $junk, $latitude, $longitude, $junk, $junk, $junk, $junk,
   $elevation, $junk) = @fields;
  $flat = uc(myunidecode($name));

  if ($class eq "Populated Place") {
    $importance = 0;
  } else {
    next;
  }

  # ID LATITUDE LONGITUDE ELEVATION IMPORTANCE COUNTRY_CODE STATE_CODE
  # COUNTY_CODE NAME_TYPE LANGUAGE SHORT_NAME LONG_NAME FLAT_NAME
  print $cities "us-$feature_id\t$latitude\t$longitude\t$elevation\t$importance\tUS\tUS$state_num\t$county_num\tN\teng\t$name\t\t$flat\n";
  $uscities{$feature_id} = 1;
}
close USBIGSRC;

# The columns in $uscitysrc:
# feature_id feature_name class	st_alpha st_num county county_num primary_lat_dms primary_lon_dms primary_lat_dec primary_lon_dec elev map_name
print "Reading $uscitysrc\n";
open USCITYSRC, "<:utf8", $uscitysrc;
while (<USCITYSRC>) {
  # skip header, and historical records
  next if $. == 1 || /\(historical\)/i;

  # remove CRLF and split
  chomp;
  @fields = split /\t/;
  next if $#fields != 12;
  ($feature_id, $name, $class, $state, $state_num, $county, $county_num,
   $junk, $junk, $latitude, $longitude, $elevation, $junk) = @fields;
  $flat = uc(myunidecode($name));

  if (!$uscities{$feature_id}) {
    # ID LATITUDE LONGITUDE ELEVATION IMPORTANCE COUNTRY_CODE STATE_CODE
    # COUNTY_CODE NAME_TYPE LANGUAGE SHORT_NAME LONG_NAME FLAT_NAME
    print $cities "us-$feature_id\t$latitude\t$longitude\t$elevation\t-1\tUS\tUS$state_num\t$county_num\tN\teng\t$name\t\t$flat\n";
  }

  # There are no separate entries for counties, so we just extract
  # that data out of the cities, keeping track of which counties we've
  # already added
  $county_id = "us-$state_num-$county_num";
  if (!$counties{$county_id}) {
    $counties{$county_id} = 1;

    # ID COUNTRY_CODE STATE_CODE COUNTY_CODE NAME_TYPE LANGUAGE
    # SHORT_NAME LONG_NAME FLAT_NAME
    print $counties "$county_id\tUS\tUS$state_num\t$county_num\tN\teng\t$county\t\t\n";
  }
}
close USCITYSRC;

# Now add some additional data that's not in any source file...

# $intlsrc has no entry for the US or its dependencies
$countries{"United States"} = "US";
$countries{"United States of America"} = "US";
$countries{"United States Minor Outlying Islands"} = "US";
$countries{"Micronesia, Federated States of"} = "US";
# These are in $intlsrc, but not listed as countries
$countries{"Ă…land Islands"} = "FI01";
$countries{"Antarctica"} = "AY";
$countries{"Faroe Islands"} = "FO";
$countries{"French Guiana"} = "FG";
$countries{"French Polynesia"} = "FP";
$countries{"Greenland"} = "GL";
$countries{"Guadeloupe"} = "GP";
$countries{"Martinique"} = "MB";
$countries{"New Caledonia"} = "NC";
$countries{"Reunion"} = "RE";
$countries{"Saint Pierre and Miquelon"} = "SB";
$countries{"Svalbard"} = "SV";
$countries{"Wallis and Futuna"} = "WF";
# These are variations that appear in $weathersrc
$countries{"Congo, Republic of the"} = "CF";
$countries{"Congo, Democratic Republic of the"} = "CG";
$countries{"Cote d'Ivoire"} = "IV";
$countries{"Falkland Islands, Islas Malvinas"} = "FK";
$countries{"Gambia, The"} = "GA";
$countries{"Korea, North"} = "KN";
$countries{"Korea, South"} = "KS";
$countries{"Macedonia, The Republic of"} = "MK";
$countries{"People's Republic of China"} = "CH";
$countries{"Viet Nam"} = "VM";

$usstatedata = << "EOF";
01	AL	Alabama
02	AK	Alaska
04	AZ	Arizona
05	AR	Arkansas
06	CA	California
08	CO	Colorado
09	CT	Connecticut
10	DE	Delaware
11	DC	District of Columbia
12	FL	Florida
13	GA	Georgia
15	HI	Hawaii
16	ID	Idaho
17	IL	Illinois
18	IN	Indiana
19	IA	Iowa
20	KS	Kansas
21	KY	Kentucky
22	LA	Louisiana
23	ME	Maine
24	MD	Maryland
25	MA	Massachusetts
26	MI	Michigan
27	MN	Minnesota
28	MS	Mississippi
29	MO	Missouri
30	MT	Montana
31	NE	Nebraska
32	NV	Nevada
33	NH	New Hampshire
34	NJ	New Jersey
35	NM	New Mexico
36	NY	New York
37	NC	North Carolina
38	ND	North Dakota
39	OH	Ohio
40	OK	Oklahoma
41	OR	Oregon
42	PA	Pennsylvania
44	RI	Rhode Island
45	SC	South Carolina
46	SD	South Dakota
47	TN	Tennessee
48	TX	Texas
49	UT	Utah
50	VT	Vermont
51	VA	Virginia
53	WA	Washington
54	WV	West Virginia
55	WI	Wisconsin
56	WY	Wyoming
60	AS	American Samoa
64	FM	Federated States of Micronesia
66	GU	Guam
69	MP	Northern Mariana Islands
70	PW	Palau
72	PR	Puerto Rico
74	UM	United States Minor Outlying Islands
78	VI	United States Virgin Islands
EOF
# ID COUNTRY_CODE STATE_CODE NAME_TYPE LANGUAGE SHORT_NAME
# LONG_NAME FLAT_NAME
for $usstate (split(/\n/, $usstatedata)) {
  ($fips, $abbrev, $name) = split(/\t/, $usstate);
  ($flat = uc $name) =~ s/ //g;
  $states{$name} = "US$fips";
  $states{$abbrev} = "US$fips";
  print $states "us-$fips\tUS\tUS$fips\tN\teng\t$abbrev\t$name\t$flat\n";
}

close $states;
close $counties;
close $cities;

print "\nCreating adm1 ('states') table\n";
open SQLITE, "|-:utf8", "sqlite3 $locationdb";
print SQLITE "CREATE TABLE adm1 (id TEXT, country_code TEXT, state_code TEXT, name_type TEXT, language TEXT, short_name TEXT, long_name TEXT, flat_name TEXT);\n";
print SQLITE ".separator '\t'\n";
print SQLITE ".import $states_filename adm1\n";
close SQLITE;
#unlink $states_filename;

print "Creating adm2 ('counties') table\n";
open SQLITE, "|-:utf8", "sqlite3 $locationdb";
print SQLITE "CREATE TABLE adm2 (id TEXT, country_code TEXT, state_code TEXT, county_code TEXT, name_type TEXT, language TEXT, short_name TEXT, long_name TEXT, flat_name TEXT);\n";
print SQLITE ".separator '\t'\n";
print SQLITE ".import $counties_filename adm2\n";
close SQLITE;
#unlink $counties_filename;

print "Creating cities table\n";
open SQLITE, "|-:utf8", "sqlite3 $locationdb";
print SQLITE "CREATE TABLE cities (id TEXT, latitude REAL, longitude REAL, elevation REAL, importance INTEGER, country_code TEXT, state_code TEXT, county_code TEXT, name_type TEXT, language TEXT, short_name TEXT, long_name TEXT, flat_name TEXT);\n";
print SQLITE ".separator '\t'\n";
print SQLITE ".import $cities_filename cities\n";
close SQLITE;
#unlink $cities_filename;

print "Indexing cities\n";
open SQLITE, "|-:utf8", "sqlite3 $locationdb";
print SQLITE "CREATE INDEX short_name ON cities (country_code, short_name);\n";
print SQLITE "CREATE INDEX long_name ON cities (country_code, long_name);\n";
print SQLITE "CREATE INDEX flat_name ON cities (country_code, flat_name);\n";
print SQLITE "CREATE INDEX long_lat ON cities (country_code, longitude, latitude);\n";
print SQLITE "CREATE INDEX id ON cities (id);\n";
close SQLITE;

# Update cities by marking cities "important" if they have the same
# name as their ADM1. (This seems to do a good job of picking up major
# cities in some countries that don't have PC or POP data.)
open SQLITE, "|-:utf8", "sqlite3 $locationdb";
print SQLITE "UPDATE cities SET importance=1 WHERE importance < 1 AND id IN (SELECT cities.id FROM cities INNER JOIN adm1 ON cities.country_code = adm1.country_code AND cities.state_code = adm1.state_code AND cities.flat_name = adm1.flat_name);\n";
print SQLITE "UPDATE cities SET importance=1 WHERE importance < 1 AND id IN (SELECT cities.id FROM cities INNER JOIN adm1 ON cities.country_code = adm1.country_code AND cities.state_code = adm1.state_code AND cities.long_name = adm1.short_name);\n";
close SQLITE;

# Update cities table with additional information about major cities
$/ = "\n";
open MAJORSRC, "<:utf8", $majorsrc;
open SQLITE, "|-:utf8", "sqlite3 $locationdb";

while (<MAJORSRC>) {
  chomp;
  s/\s*#.*//;

  @fields = split /\t/;
  next if $#fields != 4;
  ($country, $latitude, $longitude, $importance, $city) = @fields;

  $city =~ s/'/''/g; # for SQL

  if ($latitude && $longitude) {
    $latlo = $latitude - 0.1;
    $lathi = $latitude + 0.1;
    $lonlo = $longitude - 0.1;
    $lonhi = $longitude + 0.1;
    print SQLITE "UPDATE cities SET importance=$importance WHERE ( short_name='$city' OR long_name='$city' ) AND country_code='$country' AND importance!=-1 AND latitude BETWEEN $latlo AND $lathi AND longitude BETWEEN $lonlo AND $lonhi;\n";
  } else {
    print SQLITE "UPDATE cities SET importance=$importance WHERE ( short_name='$city' OR long_name='$city' ) AND country_code='$country' AND importance!=-1;\n";
  }
}

close MAJORSRC;
close SQLITE;

# The columns in $weathersrc are
# CODE wmo_zone wmo_id NAME STATE COUNTRY region LATITUDE_DMS LONGITUDE_DMS ue_lat ue_long ELEVATION upper_elevation primary
# (however, some lines are mising one or both of the last two fields!)
#
# The columns in the stations table will be
# CODE NAME STATE COUNTRY LATITUDE LONGITUDE ELEVATION

sub dms_to_dec {
  my ($coord) = @_;
  return 0 if $coord !~ /(\d+)-(\d+)(-(\d+))?([NSEW])/;
  my $dec = $1 + $2 / 60;
  if ($4) { $dec += $4 / 3600; }
  if ($5 =~ /[SW]/) { $dec = -$dec; }
  return $dec;
}

print "\nCreating weather stations\n";
($stations, $stations_filename) = tempfile();
open WEATHERSRC, "-|:utf8", "perl ./station-fixups.pl < $weathersrc";
while (<WEATHERSRC>) {
  # remove CRLF and split
  chomp;
  @fields = split /;/;
  next if $#fields < 11;
  
  ($code, $junk, $id, $location, $state, $country, $junk, $latitude, $longitude, $junk, $junk, $elevation) = @fields;

  $latitude = dms_to_dec($latitude);
  $longitude = dms_to_dec($longitude);

  $country_code = $countries{$country};
  if (!$country_code) {
    $alt_country = s/^(.*), (.*)$/$2 $1/;
    $country_code = $countries{$alt_country};
  }
  if (!$country_code) {
    if ($country) { print "No country for '$country'\n"; }
    $country_code = '';
  }
  if ($state) {
    $state_code = $states{$state};
    if (!$state_code) {
      print "No state for '$state'\n";
      next;
    }
  } else {
    $state_code = "";
  }

  print $stations "$code;$location;$state_code;$country_code;$latitude;$longitude;$elevation\n";
}
close WEATHERSRC;
close $stations;

open SQLITE, "|-:utf8", "sqlite3 $locationdb";
print SQLITE "CREATE TABLE stations (code TEXT, name TEXT, state TEXT, country TEXT, latitude REAL, longitude REAL, elevation REAL);\n";
print SQLITE ".separator ';'\n";
print SQLITE ".import $stations_filename stations\n";
print SQLITE "CREATE INDEX station_long_lat ON stations (longitude, latitude);\n";
close SQLITE;

# unlink $stations_filename;