__ __ __ __ _____ _ _ _____ _ _ _ | \/ | \ \ / / | __ \ (_) | | / ____| | | | | | \ / |_ __\ V / | |__) | __ ___ ____ _| |_ ___ | (___ | |__ ___| | | | |\/| | '__|> < | ___/ '__| \ \ / / _` | __/ _ \ \___ \| '_ \ / _ \ | | | | | | |_ / . \ | | | | | |\ V / (_| | || __/ ____) | | | | __/ | | |_| |_|_(_)_/ \_\ |_| |_| |_| \_/ \__,_|\__\___| |_____/|_| |_|\___V 2.1 if you need WebShell for Seo everyday contact me on Telegram Telegram Address : @jackleetFor_More_Tools:
<?php
declare(strict_types=1);
namespace Phpml\Tokenization;
use Phpml\Exception\InvalidArgumentException;
/**
 * Tokenizer that emits character n-grams taken from inside each word.
 *
 * The text is first split into words with the pattern /\w\w+/u (runs of two
 * or more word characters, so single-character words are skipped). Each word
 * is then expanded into every substring whose length, in characters, lies
 * between minGram and maxGram inclusive. N-grams never span two words.
 */
class NGramTokenizer extends WordTokenizer
{
    /**
     * @var int Smallest n-gram length (in characters) to emit; always >= 1.
     */
    private $minGram;

    /**
     * @var int Largest n-gram length (in characters) to emit; always >= $minGram.
     */
    private $maxGram;

    /**
     * @param int $minGram Smallest n-gram length to generate (must be >= 1).
     * @param int $maxGram Largest n-gram length to generate (must be >= $minGram).
     *
     * @throws InvalidArgumentException When either bound is below 1 or $minGram exceeds $maxGram.
     */
    public function __construct(int $minGram = 1, int $maxGram = 2)
    {
        if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) {
            throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
        }

        $this->minGram = $minGram;
        $this->maxGram = $maxGram;
    }

    /**
     * {@inheritdoc}
     *
     * Returns a flat list of n-grams. Words are processed in the order they
     * appear in $text; within a word, n-grams are ordered by increasing length
     * and then by increasing start offset.
     */
    public function tokenize(string $text): array
    {
        $words = [];
        preg_match_all('/\w\w+/u', $text, $words);

        $nGrams = [];
        foreach ($words[0] as $word) {
            $this->generateNGrams($word, $nGrams);
        }

        return $nGrams;
    }

    /**
     * Appends all n-grams of a single word to the accumulator.
     *
     * @param string $word   Word matched by the /u (UTF-8) pattern in tokenize().
     * @param array  $nGrams Accumulator that receives the generated n-grams, passed by reference.
     */
    private function generateNGrams(string $word, array &$nGrams): void
    {
        // The word comes from a /u regex match, so it is UTF-8. Pass the encoding
        // explicitly so the result does not depend on mb_internal_encoding().
        $length = mb_strlen($word, 'UTF-8');

        // Start directly at minGram: shorter sizes would only be generated to be discarded.
        for ($size = $this->minGram; $size <= $this->maxGram; $size++) {
            // No iterations when $size exceeds the word length.
            for ($offset = 0; $offset <= $length - $size; $offset++) {
                $nGrams[] = mb_substr($word, $offset, $size, 'UTF-8');
            }
        }
    }
}
| Name | Type | Size | Permission | Actions |
|---|---|---|---|---|
| NGramTokenizer.php | File | 1.29 KB | 0777 | |
| NGramWordTokenizer.php | File | 1.41 KB | 0777 | |
| Tokenizer.php | File | 139 B | 0777 | |
| WhitespaceTokenizer.php | File | 462 B | 0777 | |
| WordTokenizer.php | File | 273 B | 0777 | |