postgis/extensions/address_standardizer/mk-city-regex.pl
2020-07-01 11:00:58 +02:00

51 lines
1.1 KiB
Perl

#!/usr/bin/env perl
use strict;
use warnings;
use Regexp::Assemble;
# Load the state/city list and group the city names by state.
#
# Input file: usps-st-city-name.txt in the current directory, one record per
# line in the form "<state><TAB><city>".  Result: %st maps each state to an
# array ref of its city names, in file order.
my $city_file = 'usps-st-city-name.txt';

# Read the file directly instead of spawning `cat`, and stop with a clear
# message when it is missing: continuing would silently generate an empty
# table.
open(my $city_fh, '<', $city_file)
    or die "Cannot open $city_file: $!\n";
my $raw = do { local $/; <$city_fh> };
close($city_fh);
$raw = '' unless defined $raw;

# Splitting on runs of CR/LF accepts both Unix and DOS line endings and
# collapses blank lines.
my @cities = split(/[\r\n]+/, $raw);
my %st = ();
for my $line (@cities) {
    # A leading newline in the file yields one empty leading field.
    next unless length $line;

    my ($state, $city) = split(/\t/, $line);

    # Both fields are required; a record without a city would otherwise put
    # undef into the list handed to the regex assembler.
    unless (defined $state && length $state && defined $city && length $city) {
        warn "$city_file: skipping malformed line '$line'\n";
        next;
    }
    push @{ $st{$state} }, $city;
}
# Build one case-insensitive regex per state from that state's city names.
# %re maps state => regex text with every backslash doubled (the text is
# later printed inside C string literals).
#
# NOTE(review): a single assembler object is reused for every state; this
# appears to rely on Regexp::Assemble discarding its accumulated patterns
# each time re() is called -- confirm against the module documentation.
my $re;    # scratch scalar, reused by the output loop further down
my %re;
my $ra = Regexp::Assemble->new(flags => "i");
foreach my $state (sort keys %st) {
    $ra->add(@{ $st{$state} });

    # The substitution stringifies the compiled pattern and doubles each
    # backslash in place.
    ($re = $ra->re) =~ s/\\/\\\\/g;
    $re{$state} = $re;
}
# Emit the start of the generated C source: the NUM_STATES macro, the sorted
# table of state keys, and the opening line of the per-state regex table.
{
    my @state_codes = sort keys %re;
    print "#define NUM_STATES ", scalar(@state_codes), "\n\n",
          " static const char *states[NUM_STATES] = \n",
          " {\"", join('","', @state_codes), "\"};\n\n",
          " static const char *stcities[NUM_STATES] = {\n";
}
# Emit one entry of the stcities[] table per state, in sorted state order.
#
# Each entry is the state's regex wrapped as "(?:\\b)(<regex>)$" and written
# as a run of adjacent C string literals of at most 65 pattern characters
# each, preceded by a banner comment naming the state.  Entries are separated
# by a line containing only a comma.
my $emitted = 0;
for my $state (sort keys %re) {
    my $pattern = "(?:\\\\b)($re{$state})\$";

    print " ,\n" if $emitted;
    print " /* -- ", join(' -- ', ($state) x 10), " -- */\n";

    # Backslashes held back from the end of the previous chunk.  Scoped per
    # state so nothing can leak from one table entry into the next.  (Named
    # lexicals are used rather than $a/$b, which are reserved for sort.)
    my $carry = '';
    while ($pattern =~ s/^(.{1,65})//) {
        my $chunk = $1;

        # Never end a literal on a backslash: a trailing run is moved to the
        # front of the next literal so an escape is not split by the closing
        # quote.
        my $held = $chunk =~ s/(\\+)$// ? $1 : '';

        print " \"$carry$chunk\"\n";
        $carry = $held;
    }

    # Defensive flush; the pattern always ends in '$', so nothing should be
    # left over here.
    print " \"$carry\"\n" if length $carry;

    $emitted++;
}
print " };\n";