pixelfed/app/Util/Lexer/Classifier.php

179 lines
3.3 KiB
PHP

<?php
namespace App\Util\Lexer;
use Brick\Math\BigDecimal;
use Illuminate\Support\Collection;
use Illuminate\Support\Str;
class Classifier
{
/**
* @var ?callable(string): array<int, string>
*/
private $tokenizer;
/**
* @var array<string, array<string, int>>
*/
private array $words = [];
/**
* @var array<string, int>
*/
private array $documents = [];
private bool $uneven = false;
/**
* @param callable(string): array<int, string> $tokenizer
*/
public function setTokenizer(callable $tokenizer): void
{
$this->tokenizer = $tokenizer;
}
/**
* @return Collection<int, string>
*/
public function tokenize(string $string): Collection
{
if ($this->tokenizer) {
/** @var array<int, string> */
$tokens = call_user_func($this->tokenizer, $string);
return collect($tokens);
}
return Str::of($string)
->lower()
->matchAll('/[[:alpha:]]+/u');
}
/**
* @return $this
*/
public function learn(string $statement, string $type): self
{
foreach ($this->tokenize($statement) as $word) {
$this->incrementWord($type, $word);
}
$this->incrementType($type);
return $this;
}
/**
* @return Collection<string, string>
*/
public function guess(string $statement): Collection
{
$words = $this->tokenize($statement);
return collect($this->documents)
->map(function ($count, string $type) use ($words) {
$likelihood = $this->pTotal($type);
foreach ($words as $word) {
$likelihood *= $this->p($word, $type);
}
return (string) BigDecimal::of($likelihood);
})
->sortDesc();
}
public function most(string $statement): string
{
/** @var string */
return $this->guess($statement)->keys()->first();
}
/**
* @return self
*/
public function uneven(bool $enabled = false): self
{
$this->uneven = $enabled;
return $this;
}
/**
* Increment the document count for the type
*/
private function incrementType(string $type): void
{
if (! isset($this->documents[$type])) {
$this->documents[$type] = 0;
}
$this->documents[$type]++;
}
/**
* Increment the word count for the given type
*/
private function incrementWord(string $type, string $word): void
{
$ignored = config('autospam.ignored_tokens');
if(!$ignored) {
$ignored = ['the', 'a', 'of', 'and'];
} else {
$ignored = explode(',', $ignored);
}
if ($type == 'spam' && in_array($word, $ignored)) {
return;
}
if (! isset($this->words[$type][$word])) {
$this->words[$type][$word] = 0;
}
$this->words[$type][$word]++;
}
/**
* @return float|int
*/
private function p(string $word, string $type)
{
$count = $this->words[$type][$word] ?? 0;
return ($count + 1) / (array_sum($this->words[$type]) + 1);
}
/**
* @return float|int
*/
private function pTotal(string $type)
{
return $this->uneven
? ($this->documents[$type] + 1) / (array_sum($this->documents) + 1)
: 1;
}
public function export()
{
$words = $this->words;
$words = collect($words)
->map(function($w) {
arsort($w);
return $w;
})
->all();
return json_encode([
'_ns' => 'https://pixelfed.org/ns/nlp',
'_v' => '1.0',
'documents' => $this->documents,
'words' => $words
], JSON_PRETTY_PRINT|JSON_UNESCAPED_SLASHES);
}
public function import($documents, $words)
{
$this->documents = $documents;
$this->words = $words;
}
}