Commit 3cccdc35 authored by Birte Kristina Friesel
Browse files

use Text::Levenshtein(XS) for fuzzy station name matching

parent 4caa67e1
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@ Module::Build->new(
		'List::MoreUtils' => 0,
		'List::Util' => 0,
		'LWP::UserAgent' => 0,
		'Text::LevenshteinXS' => 0,
		'XML::LibXML' => 0,
	},
	sign => 1,
+4 −0
Original line number Diff line number Diff line
git HEAD

    * Result: Add info key 900
    * Station: Improve get_station matching quality by using the Levenshtein
      edit distance instead of simple substring matching
    * New dependency: Text::LevenshteinXS (see README for notes about
      drop-in replacements)

Travel::Status::DE::IRIS 1.02 - Tue May 26 2015

+14 −0
Original line number Diff line number Diff line
@@ -14,8 +14,22 @@ Dependencies
* List::Compare
* List::MoreUtils
* LWP::UserAgent
* Text::LevenshteinXS
* XML::LibXML

Note about Text::LevenshteinXS: This module is old and unmaintained, but
appears to be packaged for slightly more distros than its successor
Text::Levenshtein::XS. If it is not available for your distro (and you do
not wish to build it), the following drop-in replacements are available:

* Text::Levenshtein::XS
* Text::Levenshtein (about 10 times slower than the XS modules)

To use one of them, run the matching command from the top-level source
directory before building:
> sed -i 's/Text::LevenshteinXS/Text::Levenshtein::XS/g' Build.PL lib/Travel/Status/DE/IRIS/Stations.pm
or
> sed -i 's/Text::LevenshteinXS/Text::Levenshtein/g' Build.PL lib/Travel/Status/DE/IRIS/Stations.pm

Installation
------------

+2 −1
Original line number Diff line number Diff line
@@ -153,7 +153,8 @@ sub get_station {
	else {
		say STDERR "The input '$input_name' is ambiguous. Please choose one "
		  . 'of the following:';
		say STDERR join( "\n", map { $_->[1] } @stations );
		say STDERR
		  join( "\n", map { $_->[1] . ' (' . $_->[0] . ')' } @stations );
		exit(1);
	}
}
+22 −2
Original line number Diff line number Diff line
@@ -5,7 +5,11 @@ use warnings;
use 5.014;
use utf8;

use List::MoreUtils qw(firstval);
use List::Util qw(min);
use List::MoreUtils qw(firstval pairwise);
use Text::LevenshteinXS qw(distance);

# TODO switch to Text::Levenshtein::XS once AUR/Debian packages become available

our $VERSION = '1.02';

@@ -15265,7 +15269,19 @@ sub get_station_by_name {
		return ($actual_match);
	}

	return ( grep { $_->[1] =~ m{$name}i } @stations );
	my @distances   = map { distance( $nname, $_->[1] ) } @stations;
	my $min_dist    = min(@distances);
	my $minp1_dist  = min( grep { $_ != $min_dist } @distances );
	my @station_map = pairwise { [ $a, $b ] } @stations, @distances;

	# arbitrary selection: edit distance < 5 is probably a typo, >= 5
	# probably means the station does not exist / has an odd name
	if ( $min_dist < 5 ) {
		return map { $_->[0] } grep { $_->[1] == $min_dist } @station_map;
	}

	# always return a list when the edit distance is large
	return map { $_->[0] } grep { $_->[1] <= $minp1_dist } @station_map;
}

1;
@@ -15355,6 +15371,10 @@ None.

=item * List::MoreUtils(3pm)

=item * List::Util(3pm)

=item * Text::LevenshteinXS(3pm)

=back

=head1 BUGS AND LIMITATIONS
Loading