Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
.idea
vendor
composer.lock
7 changes: 5 additions & 2 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
}
],
"require": {

"pwfisher/command-line-php": "*"
},
"minimum-stability": "dev",
"require-dev": {
Expand All @@ -23,5 +23,8 @@
"psr-0": {
"LanguageDetector": "lib/"
}
}
},
"bin": [
"create-language-detector"
]
}
61 changes: 61 additions & 0 deletions create-language-detector
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/env php
<?php
/**
 * Command-line tool that trains a language model from sample files and
 * writes it to an output file.
 *
 * Positional arguments are glob patterns naming the sample files; the base
 * name of every matched file is passed to the learner as the language label
 * and the file contents as the sample text.
 *
 * Options:
 *   --output, -o <filename>  Output file path (defaults to 'language.php').
 *   --help, -?               Print usage and exit.
 *
 * Exit status: 0 on success or when help was requested, 1 on usage errors
 * or when no sample file could be read.
 */

// register the autoloader
require 'vendor/autoload.php';

// Training can use a fair amount of memory; that is acceptable because
// this process is only run once to build the model.
ini_set('memory_limit', '1G');

// parse parameters
$args = CommandLine::parseArgs($_SERVER['argv']);

$samples = array();
$options = array();

// Numeric keys are treated as positional arguments (sample patterns),
// every other key as a named option.
foreach ($args as $key => $arg) {
    if (is_numeric($key)) {
        $samples[$key] = $arg;
    } else {
        $options[$key] = $arg;
    }
}

$helpRequested = isset($options['help']) || isset($options['?']);

if (count($samples) === 0 || $helpRequested) {
    echo "Usage: ".$_SERVER['argv'][0]." <language sample files> [options]\n";
    echo "Options:\n";
    echo " --output, -o <filename> Sets the output script file path. Defaults to 'language.php'\n";
    echo " --help, -? Print this help\n";
    // Asking for help is a success; being called without samples is a usage error.
    exit($helpRequested ? 0 : 1);
}

$outputFile = 'language.php';

// `-o` is checked last so that it keeps precedence over `--output` when both are given.
foreach (array('output', 'o') as $optionName) {
    if (!isset($options[$optionName])) {
        continue;
    }

    // NOTE(review): a flag passed without a value is presumably reported by
    // the argument parser as a non-string (e.g. true) - confirm. Either way,
    // only a non-empty string is usable as a path.
    if (!is_string($options[$optionName]) || $options[$optionName] === '') {
        fwrite(STDERR, "Error: option --output/-o requires a file name.\n");
        exit(1);
    }

    $outputFile = $options[$optionName];
}

// we load the configuration (which will be serialized
// later into our language model file)
$config = new LanguageDetector\Config;

$c = new LanguageDetector\Learn($config);

$loadedSamples = 0;

foreach ($samples as $sample) {
    // glob() returns false on error and an empty array when nothing matches.
    $files = glob($sample);

    if ($files === false || count($files) === 0) {
        fwrite(STDERR, "Warning: no files match '{$sample}', skipping.\n");
        continue;
    }

    foreach ($files as $file) {
        // A pattern may also match directories or unreadable entries.
        $text = (is_file($file) && is_readable($file)) ? file_get_contents($file) : false;

        if ($text === false) {
            fwrite(STDERR, "Warning: unable to read '{$file}', skipping.\n");
            continue;
        }

        // feed with examples ('language', 'text');
        $c->addSample(basename($file), $text);
        $loadedSamples++;
    }
}

if ($loadedSamples === 0) {
    fwrite(STDERR, "Error: no readable sample files were found.\n");
    exit(1);
}

// some callback so we know where the process is
$c->addStepCallback(function($lang, $status) {
    echo "Learning {$lang}: $status\n";
});

// Save the model to the output file. The format object is chosen from the
// output path by the abstract format class used below.
// You can check an example format at https://github.com/crodas/LanguageDetector/blob/master/lib/LanguageDetector/Format/PHP.php
$c->save(\LanguageDetector\AbstractFormat::initFormatByPath($outputFile));
43 changes: 43 additions & 0 deletions detect-language
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env php
<?php
/**
 * Command-line tool that reads text from standard input and prints the five
 * best-scoring language candidates according to a previously built model.
 *
 * Options:
 *   --detector, -d <filename>  Language model file (defaults to 'language.php').
 *   --help, -?                 Print usage and exit.
 */

// register the autoloader
require 'vendor/autoload.php';

// Parse parameters before touching stdin, so that `--help` answers
// immediately instead of blocking until standard input reaches EOF.
$args = CommandLine::parseArgs($_SERVER['argv']);

if (isset($args['help']) || isset($args['?'])) {
    echo "Usage: cat <text file> | ".$_SERVER['argv'][0]." [options]\n";
    echo "Options:\n";
    echo " --detector, -d <filename> Sets the language detector script file path. Defaults to 'language.php'\n";
    echo " --help, -? Print this help\n";
    exit;
}

$languageScript = 'language.php';

// `-d` is checked last so that it takes precedence over `--detector`.
if (isset($args['detector'])) {
    $languageScript = $args['detector'];
}
if (isset($args['d'])) {
    $languageScript = $args['d'];
}

// get stdin text
$text = file_get_contents('php://stdin');

if ($text === false) {
    fwrite(STDERR, "Error: unable to read text from standard input.\n");
    exit(1);
}

// Load the language model from the given path.
$detect = LanguageDetector\Detect::initByPath($languageScript);

// Use detectLanguageScores() rather than detect(): it always returns the
// score list sorted best-first, whereas detect() only returns that list
// when the top two scores are within its threshold.
$languages = $detect->detectLanguageScores($text);

// keep the 5 most probable guesses
$languages = array_slice($languages, 0, 5);

// print result
echo "Detected languages:\n";

foreach ($languages as $candidate) {
    $lang = $candidate['lang'];
    // The score is multiplied by 100 to be displayed as a percentage.
    $score = $candidate['score'] * 100;
    echo " $lang:\t".number_format($score, 1)."%\n";
}
11 changes: 9 additions & 2 deletions lib/LanguageDetector/Detect.php
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,8 @@ public function getLanguages()
{
return array_keys($this->data);
}

public function detect($text, $limit = 300)
public function detectLanguageScores($text, $limit = 300)
{
$chunks = $this->parser->splitText($text, $limit);
$results = array();
Expand Down Expand Up @@ -162,6 +162,13 @@ public function detect($text, $limit = 300)
usort($distance, function($a, $b) {
return $a['score'] > $b['score'] ? -1 : 1;
});

return $distance;
}

public function detect($text, $limit = 300)
{
$distance = $this->detectLanguageScores($text, $limit);

if ($distance[0]['score'] - $distance[1]['score'] <= $this->threshold) {
/** We're not sure at all, we return the whole array then */
Expand Down