<?php

// Source: pixelfed/app/Util/Lexer/Extractor.php
/**
* @author Mike Cochrane <mikec@mikenz.geek.nz>
* @author Nick Pope <nick@nickpope.me.uk>
* @copyright Copyright © 2010, Mike Cochrane, Nick Pope
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
*/
namespace App\Util\Lexer;
/**
 * Twitter Extractor Class.
 *
 * Parses tweets and extracts URLs, usernames, username/list pairs and
 * hashtags.
 *
 * Originally written by {@link http://github.com/mikenz Mike Cochrane}, this
 * is based on code by {@link http://github.com/mzsanford Matt Sanford} and
 * heavily modified by {@link http://github.com/ngnpope Nick Pope}.
 *
 * @author Mike Cochrane <mikec@mikenz.geek.nz>
 * @author Nick Pope <nick@nickpope.me.uk>
 * @copyright Copyright © 2010, Mike Cochrane, Nick Pope
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
 */
class Extractor extends Regex
{
/**
 * Whether URLs written without a protocol are extracted.
 *
 * Read by extractURLsWithIndices() (protocol-less matches are skipped when
 * this is false) and read/written through extractURLWithoutProtocol().
 *
 * @var bool
 */
protected $extractURLWithoutProtocol = true;
/**
 * Static factory so an extractor can be built and used in one fluent chain.
 *
 * @param string|null $tweet The tweet to be parsed.
 *
 * @see __construct()
 *
 * @return Extractor A new extractor created for the given tweet.
 */
public static function create($tweet = null)
{
    $extractor = new self($tweet);

    return $extractor;
}
/**
 * Reads in a tweet to be parsed and extracts elements from it.
 *
 * The tweet is passed unchanged to the parent constructor.
 * NOTE(review): presumably the parent stores it as $this->tweet, which the
 * methods of this class fall back to when called without a tweet — confirm
 * against the parent class.
 *
 * @param string|null $tweet The tweet to extract.
 */
public function __construct($tweet = null)
{
parent::__construct($tweet);
}
/**
 * Extracts all parts of a tweet and returns an associative array containing
 * the extracted elements.
 *
 * @param string|null $tweet The tweet to extract. When null, the tweet stored
 *                           on this instance is used.
 *
 * @return array The elements in the tweet, keyed by 'hashtags', 'urls',
 *               'mentions', 'replyto', 'hashtags_with_indices',
 *               'urls_with_indices' and 'mentions_with_indices'.
 */
public function extract($tweet = null)
{
    if (is_null($tweet)) {
        $tweet = $this->tweet;
    }

    // Kept for backward compatibility: the previous implementation rebound the
    // instance to the tweet being extracted as a side effect of computing the
    // 'mentions' entry.
    $this->tweet = $tweet;

    // Every entry is computed by passing $tweet explicitly to the
    // non-deprecated extractors, so no result depends on instance state or on
    // the order in which the entries are evaluated.
    return [
        'hashtags' => $this->extractHashtags($tweet),
        'urls' => $this->extractURLs($tweet),
        'mentions' => $this->extractMentionedScreennames($tweet),
        'replyto' => $this->extractReplyScreenname($tweet),
        'hashtags_with_indices' => $this->extractHashtagsWithIndices($tweet),
        'urls_with_indices' => $this->extractURLsWithIndices($tweet),
        'mentions_with_indices' => $this->extractMentionedScreennamesWithIndices($tweet),
    ];
}
/**
 * Extract URLs, @mentions, lists, #hashtags and cashtags from a given
 * text/tweet, with overlapping entities removed.
 *
 * @param string|null $tweet The tweet to extract. When null, the tweet stored
 *                           on this instance is used.
 *
 * @return array List of extracted entities as returned by
 *               removeOverlappingEntities().
 */
public function extractEntitiesWithIndices($tweet = null)
{
    if (is_null($tweet)) {
        $tweet = $this->tweet;
    }

    // Hashtags are requested without their own URL-overlap check because
    // overlaps are resolved once, across all entity kinds, just below.
    $entities = array_merge(
        $this->extractURLsWithIndices($tweet),
        $this->extractHashtagsWithIndices($tweet, false),
        $this->extractMentionsOrListsWithIndices($tweet),
        $this->extractCashtagsWithIndices($tweet)
    );

    return $this->removeOverlappingEntities($entities);
}
/**
 * Extracts all the hashtags from the tweet.
 *
 * @param string|null $tweet The tweet to extract. When null, the tweet stored
 *                           on this instance is used.
 *
 * @return array The hashtag texts (the 'hashtag' value of each entry returned
 *               by extractHashtagsWithIndices()), in order of appearance.
 */
public function extractHashtags($tweet = null)
{
    $hashtagsOnly = [];
    foreach ($this->extractHashtagsWithIndices($tweet) as $hashtagWithIndex) {
        $hashtagsOnly[] = $hashtagWithIndex['hashtag'];
    }

    return $hashtagsOnly;
}
/**
 * Extracts all the cashtags from the tweet.
 *
 * @param string|null $tweet The tweet to extract. When null, the tweet stored
 *                           on this instance is used.
 *
 * @return array The cashtag texts (the 'cashtag' value of each entry returned
 *               by extractCashtagsWithIndices()), in order of appearance.
 */
public function extractCashtags($tweet = null)
{
    $cashtagsOnly = [];
    foreach ($this->extractCashtagsWithIndices($tweet) as $cashtagWithIndex) {
        $cashtagsOnly[] = $cashtagWithIndex['cashtag'];
    }

    return $cashtagsOnly;
}
/**
 * Extracts all the URLs from the tweet.
 *
 * @param string|null $tweet The tweet to extract. When null, the tweet stored
 *                           on this instance is used.
 *
 * @return array The URL strings (the 'url' value of each entry returned by
 *               extractURLsWithIndices()), in order of appearance.
 */
public function extractURLs($tweet = null)
{
    $urlsOnly = [];
    foreach ($this->extractURLsWithIndices($tweet) as $urlWithIndex) {
        $urlsOnly[] = $urlWithIndex['url'];
    }

    return $urlsOnly;
}
/**
 * Extract all the usernames from the tweet.
 *
 * A mention is an occurrence of a username anywhere in a tweet. Each name is
 * lowercased and reported only once, in order of first appearance.
 *
 * @param string|null $tweet The tweet to extract. When null, the tweet stored
 *                           on this instance is used.
 *
 * @return array The lowercased, de-duplicated usernames in the tweet.
 */
public function extractMentionedScreennames($tweet = null)
{
    $usernamesOnly = [];
    foreach ($this->extractMentionsOrListsWithIndices($tweet) as $mentionWithIndex) {
        $screen_name = mb_strtolower($mentionWithIndex['screen_name']);
        // Strict checks on purpose: empty() would discard the screen name "0",
        // and a loose in_array() compares numeric strings by value, so names
        // such as "10" and "1e1" would be mistaken for duplicates.
        if (!is_string($screen_name) || $screen_name === '' || in_array($screen_name, $usernamesOnly, true)) {
            continue;
        }
        $usernamesOnly[] = $screen_name;
    }

    return $usernamesOnly;
}
/**
 * Extract all the usernames from the tweet.
 *
 * A mention is an occurrence of a username anywhere in a tweet.
 *
 * Side effect: the given tweet replaces the tweet stored on this instance
 * before extraction runs.
 *
 * @param string $tweet The tweet to extract.
 *
 * @return array The usernames in the tweet, as returned by
 *               extractMentionedScreennames().
 *
 * @deprecated since version 1.1.0
 */
public function extractMentionedUsernames($tweet)
{
    $this->tweet = $tweet;

    return $this->extractMentionedScreennames($tweet);
}
/**
 * Extract the username replied to from the tweet.
 *
 * A reply is an occurrence of a username at the beginning of a tweet.
 *
 * @param string|null $tweet The tweet to extract. When null, the tweet stored
 *                           on this instance is used.
 *
 * @return string|null The first capture group of the 'valid_reply' pattern,
 *                     or null when the tweet is not a reply.
 */
public function extractReplyScreenname($tweet = null)
{
    if (is_null($tweet)) {
        $tweet = $this->tweet;
    }

    // preg_match() yields 0 on no match and false on error; both mean "no reply".
    if (!preg_match(self::$patterns['valid_reply'], $tweet, $matches)) {
        return null;
    }

    // Reject the candidate when the second capture group matches
    // 'end_mention_match'.
    if (preg_match(self::$patterns['end_mention_match'], $matches[2])) {
        return null;
    }

    return $matches[1];
}
/**
 * Extract the username replied to from the tweet.
 *
 * A reply is an occurrence of a username at the beginning of a tweet.
 *
 * The optional parameter keeps argument-less calls working while no longer
 * discarding a tweet that a caller passes in.
 *
 * @param string|null $tweet The tweet to extract. When null, the tweet stored
 *                           on this instance is used.
 *
 * @return string|null The username replied to, as returned by
 *                     extractReplyScreenname().
 *
 * @deprecated since version 1.1.0
 */
public function extractRepliedUsernames($tweet = null)
{
    return $this->extractReplyScreenname($tweet);
}
/**
 * Extracts all the hashtags and the indices they occur at from the tweet.
 *
 * @param string|null $tweet           The tweet to extract. When null, the
 *                                     tweet stored on this instance is used.
 * @param bool        $checkUrlOverlap If true, check if extracted hashtags
 *                                     overlap URLs and remove overlapping ones.
 *
 * @return array List of ['hashtag' => string, 'indices' => [start, end]]
 *               entries; positions are measured with StringUtils::strlen().
 */
public function extractHashtagsWithIndices($tweet = null, $checkUrlOverlap = true)
{
    if (is_null($tweet)) {
        $tweet = $this->tweet;
    }

    // Cheap pre-check before running the full pattern.
    // NOTE(review): only the ASCII '#' is tested here — confirm that
    // 'valid_hashtag' does not accept any other hash character.
    if (!preg_match('/[#]/iu', $tweet)) {
        return [];
    }

    preg_match_all(self::$patterns['valid_hashtag'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);

    $tags = [];
    foreach ($matches as $match) {
        // Indices 0..4 are read below, so pad to five entries; padding to only
        // three left $hashtag and $outer undefined for a shorter match array.
        list(, , $hash, $hashtag, $outer) = array_pad($match, 5, ['', 0]);

        if (preg_match(self::$patterns['end_hashtag_match'], $outer[0])) {
            continue;
        }

        // PREG_OFFSET_CAPTURE reports byte offsets; re-measure the prefix with
        // StringUtils::strlen() to obtain the position used in 'indices'.
        $start_position = $hash[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $hash[1])) : $hash[1];
        $end_position = $start_position + StringUtils::strlen($hash[0].$hashtag[0]);

        $tags[] = [
            'hashtag' => $hashtag[0],
            'indices' => [$start_position, $end_position],
        ];
    }

    if (!$checkUrlOverlap) {
        return $tags;
    }

    // Check URL overlap: merge with the URL entities, drop overlaps, then keep
    // only the surviving hashtag entities.
    $urls = $this->extractURLsWithIndices($tweet);
    $entities = $this->removeOverlappingEntities(array_merge($tags, $urls));

    $validTags = [];
    foreach ($entities as $entity) {
        if (!empty($entity['hashtag'])) {
            $validTags[] = $entity;
        }
    }

    return $validTags;
}
/**
 * Extracts all the cashtags and the indices they occur at from the tweet.
 *
 * @param string|null $tweet The tweet to extract. When null, the tweet stored
 *                           on this instance is used.
 *
 * @return array List of ['cashtag' => string, 'indices' => [start, end]]
 *               entries; positions are measured with StringUtils::strlen().
 */
public function extractCashtagsWithIndices($tweet = null)
{
    if (is_null($tweet)) {
        $tweet = $this->tweet;
    }

    // Cheap pre-check: without a '$' there can be no cashtag.
    if (!preg_match('/\$/iu', $tweet)) {
        return [];
    }

    preg_match_all(self::$patterns['valid_cashtag'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);

    $tags = [];
    foreach ($matches as $match) {
        // Indices 0..4 are read below, so pad to five entries; padding to only
        // three left $cash_text and $outer undefined for a shorter match array.
        list(, , $dollar, $cash_text, $outer) = array_pad($match, 5, ['', 0]);

        // The hashtag terminator pattern is reused for cashtags here.
        if (preg_match(self::$patterns['end_hashtag_match'], $outer[0])) {
            continue;
        }

        // PREG_OFFSET_CAPTURE reports byte offsets; re-measure the prefix with
        // StringUtils::strlen() to obtain the position used in 'indices'.
        $start_position = $dollar[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $dollar[1])) : $dollar[1];
        $end_position = $start_position + StringUtils::strlen($dollar[0].$cash_text[0]);

        $tags[] = [
            'cashtag' => $cash_text[0],
            'indices' => [$start_position, $end_position],
        ];
    }

    return $tags;
}
/**
 * Extracts all the URLs and the indices they occur at from the tweet.
 *
 * URLs with a protocol are always reported. URLs without a protocol are
 * reported only while the extractURLWithoutProtocol flag is true, and then
 * only for the part of the domain matched by 'valid_ascii_domain'.
 *
 * @param string|null $tweet The tweet to extract. When null, the tweet stored
 *                           on this instance is used.
 *
 * @return array List of ['url' => string, 'indices' => [start, end]] entries.
 */
public function extractURLsWithIndices($tweet = null)
{
    if (is_null($tweet)) {
        $tweet = $this->tweet;
    }

    // Cheap pre-check: a protocol-less URL needs at least a '.', a URL with a
    // protocol needs at least a ':'.
    $needle = $this->extractURLWithoutProtocol() ? '.' : ':';
    if (strpos($tweet, $needle) === false) {
        return [];
    }

    $urls = [];
    preg_match_all(self::$patterns['valid_url'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
    foreach ($matches as $match) {
        // Only the preceding text, URL, protocol, domain and path groups are used.
        list(, $before, $url, $protocol, $domain, , $path) = array_pad($match, 7, ['']);

        // PREG_OFFSET_CAPTURE reports byte offsets; re-measure the prefix with
        // StringUtils::strlen() to obtain the position used in 'indices'.
        $start_position = $url[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $url[1])) : $url[1];
        $end_position = $start_position + StringUtils::strlen($url[0]);

        $before = $before[0];
        $url = $url[0];
        $protocol = $protocol[0];
        $domain = $domain[0];
        $path = $path[0];

        // If protocol is missing and domain contains non-ASCII characters,
        // extract ASCII-only domains.
        if (empty($protocol)) {
            if (!$this->extractURLWithoutProtocol || preg_match(self::$patterns['invalid_url_without_protocol_preceding_chars'], $before)) {
                continue;
            }

            $last_url = null;
            $ascii_end_position = 0;
            if (preg_match(self::$patterns['valid_ascii_domain'], $domain, $asciiDomain)) {
                // Substitute the ASCII-only domain for the full domain inside the matched URL.
                $asciiDomain[0] = preg_replace('/'.preg_quote($domain, '/').'/u', $asciiDomain[0], $url);
                $ascii_start_position = StringUtils::strpos($domain, $asciiDomain[0], $ascii_end_position);
                $ascii_end_position = $ascii_start_position + StringUtils::strlen($asciiDomain[0]);
                $last_url = [
                    'url' => $asciiDomain[0],
                    'indices' => [$start_position + $ascii_start_position, $start_position + $ascii_end_position],
                ];
                // Keep the URL when it has a path, or is a special short
                // domain, or is not an invalid short domain.
                if (!empty($path)
                    || preg_match(self::$patterns['valid_special_short_domain'], $asciiDomain[0])
                    || !preg_match(self::$patterns['invalid_short_domain'], $asciiDomain[0])) {
                    $urls[] = $last_url;
                }
            }

            // no ASCII-only domain found. Skip the entire URL
            if (empty($last_url)) {
                continue;
            }

            // $last_url only contains domain. Need to add path and query if they exist.
            // NOTE(review): this updates only the local $last_url. With a
            // non-empty $path the entry was already appended to $urls above
            // (arrays are copied on assignment), so this change never reaches
            // the returned list — confirm whether that is intended.
            if (!empty($path)) {
                $last_url['url'] = preg_replace('/'.preg_quote($domain, '/').'/u', $last_url['url'], $url);
                $last_url['indices'][1] = $end_position;
            }
        } else {
            // In the case of t.co URLs, don't allow additional path characters
            if (preg_match(self::$patterns['valid_tco_url'], $url, $tcoUrlMatches)) {
                $url = $tcoUrlMatches[0];
                $end_position = $start_position + StringUtils::strlen($url);
            }
            $urls[] = [
                'url' => $url,
                'indices' => [$start_position, $end_position],
            ];
        }
    }

    return $urls;
}
/**
 * Extracts all the usernames and the indices they occur at from the tweet.
 *
 * Every entry returned by extractMentionsOrListsWithIndices() is kept; only
 * its 'list_slug' key is removed.
 *
 * @param string|null $tweet The tweet to extract. When null, the tweet stored
 *                           on this instance is used.
 *
 * @return array List of ['screen_name' => string, 'indices' => [start, end]]
 *               entries.
 */
public function extractMentionedScreennamesWithIndices($tweet = null)
{
    if (is_null($tweet)) {
        $tweet = $this->tweet;
    }

    $usernamesOnly = [];
    foreach ($this->extractMentionsOrListsWithIndices($tweet) as $mention) {
        // unset() on a missing key is a no-op, so no isset() guard is needed.
        unset($mention['list_slug']);
        $usernamesOnly[] = $mention;
    }

    return $usernamesOnly;
}
/**
 * Extracts all the usernames and the indices they occur at from the tweet.
 *
 * The optional parameter keeps argument-less calls working while no longer
 * discarding a tweet that a caller passes in.
 *
 * @param string|null $tweet The tweet to extract. When null, the tweet stored
 *                           on this instance is used.
 *
 * @return array The username elements in the tweet, as returned by
 *               extractMentionedScreennamesWithIndices().
 *
 * @deprecated since version 1.1.0
 */
public function extractMentionedUsernamesWithIndices($tweet = null)
{
    return $this->extractMentionedScreennamesWithIndices($tweet);
}
/**
 * Extracts all the usernames and lists, and the indices they occur at, from
 * the tweet.
 *
 * @param string|null $tweet The tweet to extract. When null, the tweet stored
 *                           on this instance is used.
 *
 * @return array List of ['screen_name' => string, 'list_slug' => string,
 *               'indices' => [start, end]] entries. 'list_slug' is an empty
 *               string for a plain mention; for a list the end index also
 *               covers the slug.
 */
public function extractMentionsOrListsWithIndices($tweet = null)
{
    if (is_null($tweet)) {
        $tweet = $this->tweet;
    }

    // Cheap pre-check before running the full pattern.
    // NOTE(review): only the ASCII '@' is tested here — confirm that
    // 'valid_mentions_or_lists' does not accept any other at-sign character.
    if (!preg_match('/[@]/iu', $tweet)) {
        return [];
    }

    preg_match_all(self::$patterns['valid_mentions_or_lists'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);

    $results = [];
    foreach ($matches as $match) {
        list(, , $at, $username, $list_slug, $outer) = array_pad($match, 6, ['', 0]);

        // Reject the candidate before doing any position arithmetic.
        if (preg_match(self::$patterns['end_mention_match'], $outer[0])) {
            continue;
        }

        // PREG_OFFSET_CAPTURE reports byte offsets; re-measure the prefix with
        // StringUtils::strlen() to obtain the position used in 'indices'.
        $start_position = $at[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $at[1])) : $at[1];
        $end_position = $start_position + StringUtils::strlen($at[0]) + StringUtils::strlen($username[0]);
        if (!empty($list_slug[0])) {
            $end_position += StringUtils::strlen($list_slug[0]);
        }

        $results[] = [
            'screen_name' => $username[0],
            'list_slug' => $list_slug[0],
            'indices' => [$start_position, $end_position],
        ];
    }

    return $results;
}
/**
 * Extracts all the usernames and lists, and the indices they occur at, from
 * the tweet.
 *
 * The optional parameter mirrors extractMentionsOrListsWithIndices();
 * argument-less calls behave as before.
 *
 * @param string|null $tweet The tweet to extract. When null, the tweet stored
 *                           on this instance is used.
 *
 * @return array The username elements in the tweet, as returned by
 *               extractMentionsOrListsWithIndices().
 *
 * @deprecated since version 1.1.0
 */
public function extractMentionedUsernamesOrListsWithIndices($tweet = null)
{
    return $this->extractMentionsOrListsWithIndices($tweet);
}
/**
 * Combined setter/getter for extractURLWithoutProtocol.
 *
 * Called without an argument (or with null) it returns the current flag.
 * Called with any other value it stores that value cast to bool and returns
 * this instance for chaining.
 *
 * @param bool|null $flag New value of the flag, or null to read it.
 *
 * @return bool|Extractor The current flag when reading, $this when writing.
 */
public function extractURLWithoutProtocol($flag = null)
{
    if (is_null($flag)) {
        return $this->extractURLWithoutProtocol;
    }

    $this->extractURLWithoutProtocol = (bool) $flag;

    return $this;
}
/**
 * Remove overlapping entities.
 * This returns a new array with no overlapping entities.
 *
 * Entities are sorted by start index; an entity is dropped when it starts
 * before the end index of the most recently kept entity.
 *
 * @param array $entities Entities, each carrying an 'indices' => [start, end] pair.
 *
 * @return array The kept entities, ordered by start index.
 */
public function removeOverlappingEntities($entities)
{
    $result = [];

    // NOTE: usort() is only guaranteed stable from PHP 8.0 on, so for entities
    // sharing a start index the survivor may depend on the PHP version.
    usort($entities, [$this, 'sortEntites']);

    $prev = null;
    foreach ($entities as $entity) {
        if ($prev !== null && $entity['indices'][0] < $prev['indices'][1]) {
            continue;
        }
        $prev = $entity;
        $result[] = $entity;
    }

    return $result;
}
/**
 * Comparison callback that orders entities by their start index.
 *
 * Used as the usort() callback in removeOverlappingEntities().
 *
 * @param array $a Entity carrying an 'indices' => [start, end] pair.
 * @param array $b Entity carrying an 'indices' => [start, end] pair.
 *
 * @return int -1, 0 or 1 when $a starts before, at, or after $b.
 */
protected function sortEntites($a, $b)
{
    $startA = $a['indices'][0];
    $startB = $b['indices'][0];

    if ($startA == $startB) {
        return 0;
    }

    return ($startA < $startB) ? -1 : 1;
}
}