Loading Build.PL +1 −0 Original line number Original line Diff line number Diff line Loading @@ -28,6 +28,7 @@ Module::Build->new( 'List::Compare' => '0.29', 'List::Compare' => '0.29', 'List::MoreUtils' => 0, 'List::MoreUtils' => 0, 'List::Util' => 0, 'List::Util' => 0, 'List::UtilsBy' => 0, 'LWP::UserAgent' => 0, 'LWP::UserAgent' => 0, 'Text::LevenshteinXS' => 0, 'Text::LevenshteinXS' => 0, 'XML::LibXML' => 0, 'XML::LibXML' => 0, Loading Changelog +2 −1 Original line number Original line Diff line number Diff line Loading @@ -2,9 +2,10 @@ git HEAD * Result: Add info key 900 * Result: Add info key 900 * Station: Improve get_station matching quality by using the Levenshtein * Station: Improve get_station matching quality by using the Levenshtein edit distance instead of simple substring matching edit distance instead in addition to simple substring matching * new dependency: Text::LevenshteinXS (see README for notes about * new dependency: Text::LevenshteinXS (see README for notes about drop-in replacements) drop-in replacements) * new dependency: List::UtilsBy Travel::Status::DE::IRIS 1.02 - Tue May 26 2015 Travel::Status::DE::IRIS 1.02 - Tue May 26 2015 Loading README +1 −0 Original line number Original line Diff line number Diff line Loading @@ -13,6 +13,7 @@ Dependencies * DateTime::Format::Strptime * DateTime::Format::Strptime * List::Compare * List::Compare * List::MoreUtils * List::MoreUtils * List::UtilsBy * LWP::UserAgent * LWP::UserAgent * Text::LevenshteinXS * Text::LevenshteinXS * XML::LibXML * XML::LibXML Loading lib/Travel/Status/DE/IRIS/Stations.pm +5 −8 Original line number Original line Diff line number Diff line Loading @@ -6,6 +6,7 @@ use 5.014; use utf8; use utf8; use List::Util qw(min); use List::Util qw(min); use List::UtilsBy qw(uniq_by); use List::MoreUtils qw(firstval pairwise); use List::MoreUtils qw(firstval pairwise); use Text::LevenshteinXS qw(distance); use Text::LevenshteinXS qw(distance); Loading Loading @@ -15271,17 +15272,13 @@ sub get_station_by_name { my @distances = map { distance( $nname, $_->[1] ) } @stations; my @distances = map { distance( $nname, $_->[1] ) } @stations; my $min_dist = min(@distances); my $min_dist = min(@distances); my $minp1_dist = min( grep { $_ != $min_dist } @distances ); my @station_map = pairwise { [ $a, $b ] } @stations, @distances; my @station_map = pairwise { [ $a, $b ] } @stations, @distances; # arbitrary selection: edit distance < 5 is probably a typo, >= 5 my @substring_matches = grep { $_->[1] =~ m{$name}i } @stations; # probably means the station does not exist / has an odd name my @levenshtein_matches if ( $min_dist < 5 ) { = map { $_->[0] } grep { $_->[1] == $min_dist } @station_map; return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map; } # always return a list when the edit distance is large return uniq_by { $_->[0] } ( @substring_matches, @levenshtein_matches ); return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map; } } 1; 1; Loading scripts/acronyms.pl +9 −12 Original line number Original line Diff line number Diff line Loading @@ -16,6 +16,7 @@ use 5.014; use utf8; use utf8; use List::Util qw(min); use List::Util qw(min); use List::UtilsBy qw(uniq_by); use List::MoreUtils qw(firstval pairwise); use List::MoreUtils qw(firstval pairwise); use Text::LevenshteinXS qw(distance); use Text::LevenshteinXS qw(distance); Loading Loading @@ -125,7 +126,7 @@ sub normalize { } } sub get_station { sub get_station { my ( $name ) = @_; my ($name) = @_; my $ds100_match = firstval { $name eq $_->[0] } @stations; my $ds100_match = firstval { $name eq $_->[0] } @stations; Loading @@ -137,34 +138,30 @@ sub get_station { } } sub get_station_by_name { sub get_station_by_name { my ( $name ) = @_; my ($name) = @_; my $nname = lc($name); my $nname = lc($name); my $actual_match = firstval { $nname eq lc($_->[1]) } @stations; my $actual_match = firstval { $nname eq lc( $_->[1] ) } @stations; if ($actual_match) { if ($actual_match) { return ($actual_match); return ($actual_match); } } $nname = normalize($nname); $nname = normalize($nname); $actual_match = firstval { $nname eq normalize(lc($_->[1])) } @stations; $actual_match = firstval { $nname eq normalize( lc( $_->[1] ) ) } @stations; if ($actual_match) { if ($actual_match) { return ($actual_match); return ($actual_match); } } my @distances = map { distance( $nname, $_->[1] ) } @stations; my @distances = map { distance( $nname, $_->[1] ) } @stations; my $min_dist = min(@distances); my $min_dist = min(@distances); my $minp1_dist = min( grep { $_ != $min_dist } @distances ); my @station_map = pairwise { [ $a, $b ] } @stations, @distances; my @station_map = pairwise { [ $a, $b ] } @stations, @distances; # arbitrary selection: edit distance < 5 is probably a typo, >= 5 my @substring_matches = grep { $_->[1] =~ m{$name}i } @stations; # probably means the station does not exist / has an odd name my @levenshtein_matches if ( $min_dist < 5 ) { = map { $_->[0] } grep { $_->[1] == $min_dist } @station_map; return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map; } # always return a list when the edit distance is large return uniq_by { $_->[0] } ( @substring_matches, @levenshtein_matches ); return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map; } } 1; 1; Loading Loading
Build.PL +1 −0 Original line number Original line Diff line number Diff line Loading @@ -28,6 +28,7 @@ Module::Build->new( 'List::Compare' => '0.29', 'List::Compare' => '0.29', 'List::MoreUtils' => 0, 'List::MoreUtils' => 0, 'List::Util' => 0, 'List::Util' => 0, 'List::UtilsBy' => 0, 'LWP::UserAgent' => 0, 'LWP::UserAgent' => 0, 'Text::LevenshteinXS' => 0, 'Text::LevenshteinXS' => 0, 'XML::LibXML' => 0, 'XML::LibXML' => 0, Loading
Changelog +2 −1 Original line number Original line Diff line number Diff line Loading @@ -2,9 +2,10 @@ git HEAD * Result: Add info key 900 * Result: Add info key 900 * Station: Improve get_station matching quality by using the Levenshtein * Station: Improve get_station matching quality by using the Levenshtein edit distance instead of simple substring matching edit distance instead in addition to simple substring matching * new dependency: Text::LevenshteinXS (see README for notes about * new dependency: Text::LevenshteinXS (see README for notes about drop-in replacements) drop-in replacements) * new dependency: List::UtilsBy Travel::Status::DE::IRIS 1.02 - Tue May 26 2015 Travel::Status::DE::IRIS 1.02 - Tue May 26 2015 Loading
README +1 −0 Original line number Original line Diff line number Diff line Loading @@ -13,6 +13,7 @@ Dependencies * DateTime::Format::Strptime * DateTime::Format::Strptime * List::Compare * List::Compare * List::MoreUtils * List::MoreUtils * List::UtilsBy * LWP::UserAgent * LWP::UserAgent * Text::LevenshteinXS * Text::LevenshteinXS * XML::LibXML * XML::LibXML Loading
lib/Travel/Status/DE/IRIS/Stations.pm +5 −8 Original line number Original line Diff line number Diff line Loading @@ -6,6 +6,7 @@ use 5.014; use utf8; use utf8; use List::Util qw(min); use List::Util qw(min); use List::UtilsBy qw(uniq_by); use List::MoreUtils qw(firstval pairwise); use List::MoreUtils qw(firstval pairwise); use Text::LevenshteinXS qw(distance); use Text::LevenshteinXS qw(distance); Loading Loading @@ -15271,17 +15272,13 @@ sub get_station_by_name { my @distances = map { distance( $nname, $_->[1] ) } @stations; my @distances = map { distance( $nname, $_->[1] ) } @stations; my $min_dist = min(@distances); my $min_dist = min(@distances); my $minp1_dist = min( grep { $_ != $min_dist } @distances ); my @station_map = pairwise { [ $a, $b ] } @stations, @distances; my @station_map = pairwise { [ $a, $b ] } @stations, @distances; # arbitrary selection: edit distance < 5 is probably a typo, >= 5 my @substring_matches = grep { $_->[1] =~ m{$name}i } @stations; # probably means the station does not exist / has an odd name my @levenshtein_matches if ( $min_dist < 5 ) { = map { $_->[0] } grep { $_->[1] == $min_dist } @station_map; return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map; } # always return a list when the edit distance is large return uniq_by { $_->[0] } ( @substring_matches, @levenshtein_matches ); return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map; } } 1; 1; Loading
scripts/acronyms.pl +9 −12 Original line number Original line Diff line number Diff line Loading @@ -16,6 +16,7 @@ use 5.014; use utf8; use utf8; use List::Util qw(min); use List::Util qw(min); use List::UtilsBy qw(uniq_by); use List::MoreUtils qw(firstval pairwise); use List::MoreUtils qw(firstval pairwise); use Text::LevenshteinXS qw(distance); use Text::LevenshteinXS qw(distance); Loading Loading @@ -125,7 +126,7 @@ sub normalize { } } sub get_station { sub get_station { my ( $name ) = @_; my ($name) = @_; my $ds100_match = firstval { $name eq $_->[0] } @stations; my $ds100_match = firstval { $name eq $_->[0] } @stations; Loading @@ -137,34 +138,30 @@ sub get_station { } } sub get_station_by_name { sub get_station_by_name { my ( $name ) = @_; my ($name) = @_; my $nname = lc($name); my $nname = lc($name); my $actual_match = firstval { $nname eq lc($_->[1]) } @stations; my $actual_match = firstval { $nname eq lc( $_->[1] ) } @stations; if ($actual_match) { if ($actual_match) { return ($actual_match); return ($actual_match); } } $nname = normalize($nname); $nname = normalize($nname); $actual_match = firstval { $nname eq normalize(lc($_->[1])) } @stations; $actual_match = firstval { $nname eq normalize( lc( $_->[1] ) ) } @stations; if ($actual_match) { if ($actual_match) { return ($actual_match); return ($actual_match); } } my @distances = map { distance( $nname, $_->[1] ) } @stations; my @distances = map { distance( $nname, $_->[1] ) } @stations; my $min_dist = min(@distances); my $min_dist = min(@distances); my $minp1_dist = min( grep { $_ != $min_dist } @distances ); my @station_map = pairwise { [ $a, $b ] } @stations, @distances; my @station_map = pairwise { [ $a, $b ] } @stations, @distances; # arbitrary selection: edit distance < 5 is probably a typo, >= 5 my @substring_matches = grep { $_->[1] =~ m{$name}i } @stations; # probably means the station does not exist / has an odd name my @levenshtein_matches if ( $min_dist < 5 ) { = map { $_->[0] } grep { $_->[1] == $min_dist } @station_map; return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map; } # always return a list when the edit distance is large return uniq_by { $_->[0] } ( @substring_matches, @levenshtein_matches ); return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map; } } 1; 1; Loading