Commit f06afc08 authored by Birte Kristina Friesel's avatar Birte Kristina Friesel
Browse files

re-enable substring matching -> use both for station lookup

parent 3cccdc35
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -28,6 +28,7 @@ Module::Build->new(
		'List::Compare' => '0.29',
		'List::MoreUtils' => 0,
		'List::Util' => 0,
		'List::UtilsBy' => 0,
		'LWP::UserAgent' => 0,
		'Text::LevenshteinXS' => 0,
		'XML::LibXML' => 0,
+2 −1
Original line number Diff line number Diff line
@@ -2,9 +2,10 @@ git HEAD

    * Result: Add info key 900
    * Station: Improve get_station matching quality by using the Levenshtein
      edit distance instead of simple substring matching
      edit distance instead in addition to simple substring matching
    * new dependency: Text::LevenshteinXS (see README for notes about
      drop-in replacements)
    * new dependency: List::UtilsBy

Travel::Status::DE::IRIS 1.02 - Tue May 26 2015

+1 −0
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@ Dependencies
* DateTime::Format::Strptime
* List::Compare
* List::MoreUtils
* List::UtilsBy
* LWP::UserAgent
* Text::LevenshteinXS
* XML::LibXML
+5 −8
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@ use 5.014;
use utf8;

use List::Util qw(min);
use List::UtilsBy qw(uniq_by);
use List::MoreUtils qw(firstval pairwise);
use Text::LevenshteinXS qw(distance);

@@ -15271,17 +15272,13 @@ sub get_station_by_name {

	my @distances   = map { distance( $nname, $_->[1] ) } @stations;
	my $min_dist    = min(@distances);
	my $minp1_dist  = min( grep { $_ != $min_dist } @distances );
	my @station_map = pairwise { [ $a, $b ] } @stations, @distances;

	# arbitrary selection: edit distance < 5 is probably a typo, >= 5
	# probably means the station does not exist / has an odd name
	if ( $min_dist < 5 ) {
		return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;
	}
	my @substring_matches = grep { $_->[1] =~ m{$name}i } @stations;
	my @levenshtein_matches
	  = map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;

	# always return a list when the edit distance is large
	return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map;
	return uniq_by { $_->[0] } ( @substring_matches, @levenshtein_matches );
}

1;
+9 −12
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@ use 5.014;
use utf8;

use List::Util qw(min);
use List::UtilsBy qw(uniq_by);
use List::MoreUtils qw(firstval pairwise);
use Text::LevenshteinXS qw(distance);

@@ -154,17 +155,13 @@ sub get_station_by_name {

	my @distances   = map { distance( $nname, $_->[1] ) } @stations;
	my $min_dist    = min(@distances);
	my $minp1_dist  = min( grep { $_ != $min_dist } @distances );
	my @station_map = pairwise { [ $a, $b ] } @stations, @distances;

	# arbitrary selection: edit distance < 5 is probably a typo, >= 5
	# probably means the station does not exist / has an odd name
	if ( $min_dist < 5 ) {
		return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;
	}
	my @substring_matches = grep { $_->[1] =~ m{$name}i } @stations;
	my @levenshtein_matches
	  = map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;

	# always return a list when the edit distance is large
	return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map;
	return uniq_by { $_->[0] } ( @substring_matches, @levenshtein_matches );
}

1;
Loading