1
0
Fork 0
pixelfed/app/Util/Lexer/Extractor.php

546 lines
17 KiB
PHP
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
/**
* @author Mike Cochrane <mikec@mikenz.geek.nz>
* @author Nick Pope <nick@nickpope.me.uk>
* @copyright Copyright © 2010, Mike Cochrane, Nick Pope
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
*/
namespace App\Util\Lexer;
/**
* Twitter Extractor Class.
*
* Parses tweets and extracts URLs, usernames, username/list pairs and
* hashtags.
*
* Originally written by {@link http://github.com/mikenz Mike Cochrane}, this
* is based on code by {@link http://github.com/mzsanford Matt Sanford} and
* heavily modified by {@link http://github.com/ngnpope Nick Pope}.
*
* @author Mike Cochrane <mikec@mikenz.geek.nz>
* @author Nick Pope <nick@nickpope.me.uk>
* @copyright Copyright © 2010, Mike Cochrane, Nick Pope
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
*/
class Extractor extends Regex
{
/**
* @var bool
*/
protected $extractURLWithoutProtocol = true;
/**
* Provides fluent method chaining.
*
* @param string $tweet The tweet to be converted.
*
* @see __construct()
*
* @return Extractor
*/
public static function create($tweet = null)
{
return new self($tweet);
}
/**
* Reads in a tweet to be parsed and extracts elements from it.
*
* Extracts various parts of a tweet including URLs, usernames, hashtags...
*
* @param string $tweet The tweet to extract.
*/
public function __construct($tweet = null)
{
parent::__construct($tweet);
}
/**
* Extracts all parts of a tweet and returns an associative array containing
* the extracted elements.
*
* @param string $tweet The tweet to extract.
*
* @return array The elements in the tweet.
*/
public function extract($tweet = null)
{
if (is_null($tweet)) {
$tweet = $this->tweet;
}
return [
'hashtags' => $this->extractHashtags($tweet),
'urls' => $this->extractURLs($tweet),
'mentions' => $this->extractMentionedUsernames($tweet),
'replyto' => $this->extractRepliedUsernames($tweet),
'hashtags_with_indices' => $this->extractHashtagsWithIndices($tweet),
'urls_with_indices' => $this->extractURLsWithIndices($tweet),
'mentions_with_indices' => $this->extractMentionedUsernamesWithIndices($tweet),
];
}
/**
* Extract URLs, @mentions, lists and #hashtag from a given text/tweet.
*
* @param string $tweet The tweet to extract.
*
* @return array list of extracted entities
*/
public function extractEntitiesWithIndices($tweet = null)
{
if (is_null($tweet)) {
$tweet = $this->tweet;
}
$entities = [];
$entities = array_merge($entities, $this->extractURLsWithIndices($tweet));
$entities = array_merge($entities, $this->extractHashtagsWithIndices($tweet, false));
$entities = array_merge($entities, $this->extractMentionsOrListsWithIndices($tweet));
$entities = $this->removeOverlappingEntities($entities);
return $entities;
}
/**
* Extracts all the hashtags from the tweet.
*
* @param string $tweet The tweet to extract.
*
* @return array The hashtag elements in the tweet.
*/
public function extractHashtags($tweet = null)
{
$hashtagsOnly = [];
$hashtagsWithIndices = $this->extractHashtagsWithIndices($tweet);
foreach ($hashtagsWithIndices as $hashtagWithIndex) {
$hashtagsOnly[] = $hashtagWithIndex['hashtag'];
}
return $hashtagsOnly;
}
/**
* Extracts all the cashtags from the tweet.
*
* @param string $tweet The tweet to extract.
*
* @return array The cashtag elements in the tweet.
*/
public function extractCashtags($tweet = null)
{
$cashtagsOnly = [];
$cashtagsWithIndices = $this->extractCashtagsWithIndices($tweet);
foreach ($cashtagsWithIndices as $cashtagWithIndex) {
$cashtagsOnly[] = $cashtagWithIndex['cashtag'];
}
return $cashtagsOnly;
}
/**
* Extracts all the URLs from the tweet.
*
* @param string $tweet The tweet to extract.
*
* @return array The URL elements in the tweet.
*/
public function extractURLs($tweet = null)
{
$urlsOnly = [];
$urlsWithIndices = $this->extractURLsWithIndices($tweet);
foreach ($urlsWithIndices as $urlWithIndex) {
$urlsOnly[] = $urlWithIndex['url'];
}
return $urlsOnly;
}
/**
* Extract all the usernames from the tweet.
*
* A mention is an occurrence of a username anywhere in a tweet.
*
* @param string $tweet The tweet to extract.
*
* @return array The usernames elements in the tweet.
*/
public function extractMentionedScreennames($tweet = null)
{
$usernamesOnly = [];
$mentionsWithIndices = $this->extractMentionsOrListsWithIndices($tweet);
foreach ($mentionsWithIndices as $mentionWithIndex) {
$screen_name = mb_strtolower($mentionWithIndex['screen_name']);
if (empty($screen_name) or in_array($screen_name, $usernamesOnly)) {
continue;
}
$usernamesOnly[] = $screen_name;
}
return $usernamesOnly;
}
/**
* Extract all the usernames from the tweet.
*
* A mention is an occurrence of a username anywhere in a tweet.
*
* @return array The usernames elements in the tweet.
*
* @deprecated since version 1.1.0
*/
public function extractMentionedUsernames($tweet)
{
$this->tweet = $tweet;
return $this->extractMentionedScreennames($tweet);
}
/**
* Extract all the usernames replied to from the tweet.
*
* A reply is an occurrence of a username at the beginning of a tweet.
*
* @param string $tweet The tweet to extract.
*
* @return array The usernames replied to in a tweet.
*/
public function extractReplyScreenname($tweet = null)
{
if (is_null($tweet)) {
$tweet = $this->tweet;
}
$matched = preg_match(self::$patterns['valid_reply'], $tweet, $matches);
// Check username ending in
if ($matched && preg_match(self::$patterns['end_mention_match'], $matches[2])) {
$matched = false;
}
return $matched ? $matches[1] : null;
}
/**
* Extract all the usernames replied to from the tweet.
*
* A reply is an occurrence of a username at the beginning of a tweet.
*
* @return array The usernames replied to in a tweet.
*
* @deprecated since version 1.1.0
*/
public function extractRepliedUsernames()
{
return $this->extractReplyScreenname();
}
/**
* Extracts all the hashtags and the indices they occur at from the tweet.
*
* @param string $tweet The tweet to extract.
* @param bool $checkUrlOverlap if true, check if extracted hashtags overlap URLs and remove overlapping ones
*
* @return array The hashtag elements in the tweet.
*/
public function extractHashtagsWithIndices($tweet = null, $checkUrlOverlap = true)
{
if (is_null($tweet)) {
$tweet = $this->tweet;
}
if (!preg_match('/[#]/iu', $tweet)) {
return [];
}
preg_match_all(self::$patterns['valid_hashtag'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
$tags = [];
foreach ($matches as $match) {
list($all, $before, $hash, $hashtag, $outer) = array_pad($match, 3, ['', 0]);
$start_position = $hash[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $hash[1])) : $hash[1];
$end_position = $start_position + StringUtils::strlen($hash[0].$hashtag[0]);
if (preg_match(self::$patterns['end_hashtag_match'], $outer[0])) {
continue;
}
$tags[] = [
'hashtag' => $hashtag[0],
'indices' => [$start_position, $end_position],
];
}
if (!$checkUrlOverlap) {
return $tags;
}
// check url overlap
$urls = $this->extractURLsWithIndices($tweet);
$entities = $this->removeOverlappingEntities(array_merge($tags, $urls));
$validTags = [];
foreach ($entities as $entity) {
if (empty($entity['hashtag'])) {
continue;
}
$validTags[] = $entity;
}
return $validTags;
}
/**
* Extracts all the cashtags and the indices they occur at from the tweet.
*
* @param string $tweet The tweet to extract.
*
* @return array The cashtag elements in the tweet.
*/
public function extractCashtagsWithIndices($tweet = null)
{
}
/**
* Extracts all the URLs and the indices they occur at from the tweet.
*
* @param string $tweet The tweet to extract.
*
* @return array The URLs elements in the tweet.
*/
public function extractURLsWithIndices($tweet = null)
{
if (is_null($tweet)) {
$tweet = $this->tweet;
}
$needle = $this->extractURLWithoutProtocol() ? '.' : ':';
if (strpos($tweet, $needle) === false) {
return [];
}
$urls = [];
preg_match_all(self::$patterns['valid_url'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
foreach ($matches as $match) {
list($all, $before, $url, $protocol, $domain, $port, $path, $query) = array_pad($match, 8, ['']);
$start_position = $url[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $url[1])) : $url[1];
$end_position = $start_position + StringUtils::strlen($url[0]);
$all = $all[0];
$before = $before[0];
$url = $url[0];
$protocol = $protocol[0];
$domain = $domain[0];
$port = $port[0];
$path = $path[0];
$query = $query[0];
// If protocol is missing and domain contains non-ASCII characters,
// extract ASCII-only domains.
if (empty($protocol)) {
if (!$this->extractURLWithoutProtocol || preg_match(self::$patterns['invalid_url_without_protocol_preceding_chars'], $before)) {
continue;
}
$last_url = null;
$ascii_end_position = 0;
if (preg_match(self::$patterns['valid_ascii_domain'], $domain, $asciiDomain)) {
$asciiDomain[0] = preg_replace('/'.preg_quote($domain, '/').'/u', $asciiDomain[0], $url);
$ascii_start_position = StringUtils::strpos($domain, $asciiDomain[0], $ascii_end_position);
$ascii_end_position = $ascii_start_position + StringUtils::strlen($asciiDomain[0]);
$last_url = [
'url' => $asciiDomain[0],
'indices' => [$start_position + $ascii_start_position, $start_position + $ascii_end_position],
];
if (!empty($path)
|| preg_match(self::$patterns['valid_special_short_domain'], $asciiDomain[0])
|| !preg_match(self::$patterns['invalid_short_domain'], $asciiDomain[0])) {
$urls[] = $last_url;
}
}
// no ASCII-only domain found. Skip the entire URL
if (empty($last_url)) {
continue;
}
// $last_url only contains domain. Need to add path and query if they exist.
if (!empty($path)) {
// last_url was not added. Add it to urls here.
$last_url['url'] = preg_replace('/'.preg_quote($domain, '/').'/u', $last_url['url'], $url);
$last_url['indices'][1] = $end_position;
}
} else {
// In the case of t.co URLs, don't allow additional path characters
if (preg_match(self::$patterns['valid_tco_url'], $url, $tcoUrlMatches)) {
$url = $tcoUrlMatches[0];
$end_position = $start_position + StringUtils::strlen($url);
}
$urls[] = [
'url' => $url,
'indices' => [$start_position, $end_position],
];
}
}
return $urls;
}
/**
* Extracts all the usernames and the indices they occur at from the tweet.
*
* @param string $tweet The tweet to extract.
*
* @return array The username elements in the tweet.
*/
public function extractMentionedScreennamesWithIndices($tweet = null)
{
if (is_null($tweet)) {
$tweet = $this->tweet;
}
$usernamesOnly = [];
$mentions = $this->extractMentionsOrListsWithIndices($tweet);
foreach ($mentions as $mention) {
if (isset($mention['list_slug'])) {
unset($mention['list_slug']);
}
$usernamesOnly[] = $mention;
}
return $usernamesOnly;
}
/**
* Extracts all the usernames and the indices they occur at from the tweet.
*
* @return array The username elements in the tweet.
*
* @deprecated since version 1.1.0
*/
public function extractMentionedUsernamesWithIndices()
{
return $this->extractMentionedScreennamesWithIndices();
}
/**
* Extracts all the usernames and the indices they occur at from the tweet.
*
* @param string $tweet The tweet to extract.
*
* @return array The username elements in the tweet.
*/
public function extractMentionsOrListsWithIndices($tweet = null)
{
if (is_null($tweet)) {
$tweet = $this->tweet;
}
if (!preg_match('/[@]/iu', $tweet)) {
return [];
}
preg_match_all(self::$patterns['valid_mentions_or_lists'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
$results = [];
foreach ($matches as $match) {
list($all, $before, $at, $username, $list_slug, $outer) = array_pad($match, 6, ['', 0]);
$start_position = $at[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $at[1])) : $at[1];
$end_position = $start_position + StringUtils::strlen($at[0]) + StringUtils::strlen($username[0]);
$entity = [
'screen_name' => $username[0],
'list_slug' => $list_slug[0],
'indices' => [$start_position, $end_position],
];
if (preg_match(self::$patterns['end_mention_match'], $outer[0])) {
continue;
}
if (!empty($list_slug[0])) {
$entity['indices'][1] = $end_position + StringUtils::strlen($list_slug[0]);
}
$results[] = $entity;
}
return $results;
}
/**
* Extracts all the usernames and the indices they occur at from the tweet.
*
* @return array The username elements in the tweet.
*
* @deprecated since version 1.1.0
*/
public function extractMentionedUsernamesOrListsWithIndices()
{
return $this->extractMentionsOrListsWithIndices();
}
/**
* setter/getter for extractURLWithoutProtocol.
*
* @param bool $flag
*
* @return Extractor
*/
public function extractURLWithoutProtocol($flag = null)
{
if (is_null($flag)) {
return $this->extractURLWithoutProtocol;
}
$this->extractURLWithoutProtocol = (bool) $flag;
return $this;
}
/**
* Remove overlapping entities.
* This returns a new array with no overlapping entities.
*
* @param array $entities
*
* @return array
*/
public function removeOverlappingEntities($entities)
{
$result = [];
usort($entities, [$this, 'sortEntites']);
$prev = null;
foreach ($entities as $entity) {
if (isset($prev) && $entity['indices'][0] < $prev['indices'][1]) {
continue;
}
$prev = $entity;
$result[] = $entity;
}
return $result;
}
/**
* sort by entity start index.
*
* @param array $a
* @param array $b
*
* @return int
*/
protected function sortEntites($a, $b)
{
if ($a['indices'][0] == $b['indices'][0]) {
return 0;
}
return ($a['indices'][0] < $b['indices'][0]) ? -1 : 1;
}
}