1
0
Fork 0
mirror of https://github.com/Jackett/Jackett synced 2024-12-25 01:07:38 +00:00

shazbat: refactor search and parsing (#13979)

This commit is contained in:
Bogdan 2023-02-07 04:29:23 +02:00 committed by GitHub
parent 8a35175d31
commit fe93e54ac1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 215 additions and 92 deletions

View file

@ -5,9 +5,10 @@ using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using AngleSharp.Dom;
using AngleSharp.Html.Parser;
using Jackett.Common.Models;
using Jackett.Common.Models.IndexerConfig;
using Jackett.Common.Models.IndexerConfig.Bespoke;
using Jackett.Common.Services.Interfaces;
using Jackett.Common.Utils;
using Jackett.Common.Utils.Clients;
@ -22,20 +23,16 @@ namespace Jackett.Common.Indexers
private string LoginUrl => SiteLink + "login";
private string SearchUrl => SiteLink + "search";
private string TorrentsUrl => SiteLink + "torrents";
private string ShowUrl => SiteLink + "show?id=";
private string ShowUrl => SiteLink + "show";
private string RSSProfile => SiteLink + "rss_feeds";
private new ConfigurationDataBasicLoginWithRSS configData
{
get => (ConfigurationDataBasicLoginWithRSS)base.configData;
set => base.configData = value;
}
private new ConfigurationDataShazbat configData => (ConfigurationDataShazbat)base.configData;
public Shazbat(IIndexerConfigurationService configService, WebClient c, Logger l, IProtectionService ps,
ICacheService cs)
: base(id: "shazbat",
name: "Shazbat",
description: "Modern indexer",
description: "Shazbat is a PRIVATE Torrent Tracker with highly curated TV content",
link: "https://www.shazbat.tv/",
caps: new TorznabCapabilities
{
@ -49,141 +46,236 @@ namespace Jackett.Common.Indexers
logger: l,
p: ps,
cacheService: cs,
configData: new ConfigurationDataBasicLoginWithRSS())
configData: new ConfigurationDataShazbat())
{
Encoding = Encoding.UTF8;
Language = "en-US";
Type = "private";
webclient.requestDelay = 5.1;
AddCategoryMapping(1, TorznabCatType.TV);
AddCategoryMapping(2, TorznabCatType.TVSD);
AddCategoryMapping(3, TorznabCatType.TVHD);
}
private int ShowPagesFetchLimit
{
    get
    {
        // Number of show pages to fetch per search, as configured by the user.
        // Falls back to the default of 2 when the stored value is missing,
        // non-numeric or outside the supported 1..5 range.
        if (int.TryParse(configData.ShowPagesFetchLimit.Value, out var limit) && limit > 0 && limit <= 5)
            return limit;

        return 2;
    }
}
public override async Task<IndexerConfigurationStatus> ApplyConfiguration(JToken configJson)
{
LoadValuesFromJson(configJson);
var pairs = new Dictionary<string, string>
{
{"referer", "login"},
{"query", ""},
{"tv_login", configData.Username.Value},
{"tv_password", configData.Password.Value},
{"email", ""}
{ "referer", "" },
{ "query", "" },
{ "tv_timezone", "0" },
{ "tv_login", configData.Username.Value },
{ "tv_password", configData.Password.Value }
};
// Get cookie
var result = await RequestLoginAndFollowRedirect(LoginUrl, pairs, null, true, null, LoginUrl);
await ConfigureIfOK(result.Cookies, result.ContentString?.Contains("glyphicon-log-out") == true,
() => throw new ExceptionWithConfigData("The username and password entered do not match.", configData));
await ConfigureIfOK(result.Cookies, result.ContentString?.Contains("glyphicon-log-out") == true, () =>
{
throw new ExceptionWithConfigData("The username and password entered do not match.", configData);
});
var rssProfile = await RequestWithCookiesAndRetryAsync(RSSProfile);
var parser = new HtmlParser();
var rssDom = parser.ParseDocument(rssProfile.ContentString);
configData.RSSKey.Value = rssDom.QuerySelector(".col-sm-9:nth-of-type(1)").TextContent.Trim();
configData.RSSKey.Value = rssDom.QuerySelector(".col-sm-9:nth-of-type(1)")?.TextContent.Trim();
if (string.IsNullOrWhiteSpace(configData.RSSKey.Value))
throw new ExceptionWithConfigData("Failed to find RSS key.", configData);
SaveConfig();
return IndexerConfigurationStatus.RequiresTesting;
}
protected override async Task<IEnumerable<ReleaseInfo>> PerformQuery(TorznabQuery query)
{
WebResult response;
var releases = new List<ReleaseInfo>();
var queryString = query.GetQueryString();
WebResult results = null;
var searchUrls = new List<string>();
if (!string.IsNullOrWhiteSpace(query.SanitizedSearchTerm))
var searchUrls = new List<WebRequest>();
var hasGlobalFreeleech = false;
var searchTerm = query.SanitizedSearchTerm;
var term = FixSearchTerm(searchTerm);
var showTorrentsHeaders = new Dictionary<string, string>
{
var pairs = new Dictionary<string, string>
{ "Content-Type", "application/x-www-form-urlencoded" },
{ "X-Requested-With", "XMLHttpRequest" },
};
var showTorrentsBody = new Dictionary<string, string>
{
{ "portlet", "true" },
{ "tab", "true" }
};
if (!string.IsNullOrWhiteSpace(term))
{
var searchBody = new Dictionary<string, string>
{
{"search", query.SanitizedSearchTerm}
{ "search", term }
};
results = await RequestWithCookiesAndRetryAsync(
SearchUrl, null, RequestType.POST, TorrentsUrl, pairs);
results = await ReloginIfNecessary(results);
response = await RequestWithCookiesAndRetryAsync(SearchUrl, method: RequestType.POST, referer: TorrentsUrl, data: searchBody);
response = await ReloginIfNecessaryAsync(response);
var parser = new HtmlParser();
var dom = parser.ParseDocument(results.ContentString);
var dom = parser.ParseDocument(response.ContentString);
hasGlobalFreeleech = dom.QuerySelector("span:contains(\"Freeleech until:\"):has(span.datetime)") != null;
releases.AddRange(ParseResults(response, query, searchTerm, hasGlobalFreeleech));
var shows = dom.QuerySelectorAll("div.show[data-id]");
foreach (var show in shows)
if (shows.Any())
{
var showUrl = ShowUrl + show.GetAttribute("data-id");
searchUrls.Add(showUrl);
}
}
else
searchUrls.Add(TorrentsUrl);
var showPagesFetchLimit = ShowPagesFetchLimit;
try
{
foreach (var searchUrl in searchUrls)
{
results = await RequestWithCookiesAsync(searchUrl);
results = await ReloginIfNecessary(results);
var parser = new HtmlParser();
var dom = parser.ParseDocument(results.ContentString);
var rows = dom.QuerySelectorAll(
string.IsNullOrWhiteSpace(queryString) ? "#torrent-table tr" : "table tr");
var globalFreeleech =
dom.QuerySelector("span:contains(\"Freeleech until:\"):has(span.datetime)") != null;
foreach (var row in rows.Skip(1))
if (showPagesFetchLimit < 1 || showPagesFetchLimit > 5)
throw new Exception($"Value for Show Pages Fetch Limit should be between 1 and 5. Current value: {showPagesFetchLimit}.");
if (shows.Length > showPagesFetchLimit)
logger.Debug($"Your search returned {shows.Length} shows. Use a more specific search term for more relevant results.");
foreach (var show in shows.Take(showPagesFetchLimit))
{
// TODO switch to initializer
var release = new ReleaseInfo();
var titleRow = row.QuerySelector("td:nth-of-type(3)");
foreach (var child in titleRow.Children)
child.Remove();
release.Title = titleRow.TextContent.Trim();
if ((query.ImdbID == null || !TorznabCaps.MovieSearchImdbAvailable) &&
!query.MatchQueryStringAND(release.Title))
continue;
var posterStyle = row.QuerySelector("div[style^=\"cursor: pointer; background-image:url\"]")
?.GetAttribute("style");
if (!string.IsNullOrEmpty(posterStyle))
var showTorrentsQueryParams = new Dictionary<string, string>
{
var posterStr = Regex.Match(posterStyle, @"url\('(.*?)'\);").Groups[1].Value;
release.Poster = new Uri(SiteLink + posterStr);
}
{ "id", show.GetAttribute("data-id") },
{ "show_mode", "torrents" }
};
var qLink = row.QuerySelector("td:nth-of-type(5) a");
release.Link = new Uri(SiteLink + qLink.GetAttribute("href"));
release.Guid = release.Link;
var qLinkComm = row.QuerySelector("td:nth-of-type(5) a.internal");
release.Details = new Uri(SiteLink + qLinkComm.GetAttribute("href"));
var dateString = row.QuerySelector(".datetime")?.GetAttribute("data-timestamp");
if (dateString != null)
release.PublishDate = DateTimeUtil.UnixTimestampToDateTime(ParseUtil.CoerceDouble(dateString));
var infoString = row.QuerySelector("td:nth-of-type(4)").TextContent;
release.Size = ParseUtil.CoerceLong(
Regex.Match(infoString, "\\((\\d+)\\)").Value.Replace("(", "").Replace(")", ""));
var infosplit = infoString.Replace("/", string.Empty).Split(":".ToCharArray());
release.Seeders = ParseUtil.CoerceInt(infosplit[1]);
release.Peers = release.Seeders + ParseUtil.CoerceInt(infosplit[2]);
release.DownloadVolumeFactor = globalFreeleech ? 0 : 1;
release.UploadVolumeFactor = 1;
release.MinimumRatio = 1;
release.MinimumSeedTime = 172800; // 48 hours
// var tags = row.QuerySelector(".label-tag").TextContent; These don't see to parse - bad tags?
releases.Add(release);
searchUrls.Add(new WebRequest
{
Url = $"{ShowUrl}?{showTorrentsQueryParams.GetQueryString()}",
Type = RequestType.POST,
PostData = showTorrentsBody,
Headers = showTorrentsHeaders
});
}
}
}
catch (Exception ex)
else
searchUrls.Add(new WebRequest { Url = TorrentsUrl, Type = RequestType.GET });
foreach (var searchUrl in searchUrls)
{
OnParseError(results.ContentString, ex);
response = await RequestWithCookiesAsync(url: searchUrl.Url, method: searchUrl.Type, data: searchUrl.PostData, headers: searchUrl.Headers);
response = await ReloginIfNecessaryAsync(response);
try
{
releases.AddRange(ParseResults(response, query, searchTerm, hasGlobalFreeleech));
}
catch (Exception ex)
{
OnParseError(response.ContentString, ex);
}
}
foreach (var release in releases)
release.Category = release.Title.Contains("1080p") || release.Title.Contains("720p")
? new List<int> { TorznabCatType.TVHD.ID }
: new List<int> { TorznabCatType.TVSD.ID };
return releases;
}
private async Task<WebResult> ReloginIfNecessary(WebResult response)
private IList<ReleaseInfo> ParseResults(WebResult response, TorznabQuery query, string searchTerm, bool hasGlobalFreeleech = false)
{
if (response.ContentString.Contains("onclick=\"document.location='logout'\""))
var releases = new List<ReleaseInfo>();
var parser = new HtmlParser();
var dom = parser.ParseDocument(response.ContentString);
if (!hasGlobalFreeleech)
hasGlobalFreeleech = dom.QuerySelector("span:contains(\"Freeleech until:\"):has(span.datetime)") != null;
var publishDate = DateTime.Now;
var rows = dom.QuerySelectorAll("#torrent-table tr.eprow, table tr.eprow");
foreach (var row in rows)
{
var title = ParseTitle(row.QuerySelector("td:nth-of-type(3)"));
if ((query.ImdbID == null || !TorznabCaps.MovieSearchImdbAvailable) && !query.MatchQueryStringAND(title, queryStringOverride: searchTerm))
continue;
var link = new Uri(SiteLink + row.QuerySelector("td:nth-of-type(5) a[href^=\"load_torrent?\"]")?.GetAttribute("href"));
var details = new Uri(SiteLink + row.QuerySelector("td:nth-of-type(5) [href^=\"torrent_info?\"]")?.GetAttribute("href"));
var infoString = row.QuerySelector("td:nth-of-type(4)")?.TextContent.Trim() ?? string.Empty;
var infoRegex = new Regex(@"\((?<size>\d+)\):(?<seeders>\d+) \/ :(?<leechers>\d+)$", RegexOptions.Compiled);
var matchInfo = infoRegex.Match(infoString);
var size = matchInfo.Groups["size"].Success && long.TryParse(matchInfo.Groups["size"].Value, out var outSize) ? outSize : 0;
var seeders = matchInfo.Groups["seeders"].Success && int.TryParse(matchInfo.Groups["seeders"].Value, out var outSeeders) ? outSeeders : 0;
var leechers = matchInfo.Groups["leechers"].Success && int.TryParse(matchInfo.Groups["leechers"].Value, out var outLeechers) ? outLeechers : 0;
var dateTimestamp = row.QuerySelector(".datetime[data-timestamp]")?.GetAttribute("data-timestamp");
publishDate = dateTimestamp != null && ParseUtil.TryCoerceDouble(dateTimestamp, out var timestamp) ? DateTimeUtil.UnixTimestampToDateTime(timestamp) : publishDate.AddMinutes(-1);
var release = new ReleaseInfo
{
Guid = link,
Link = link,
Details = details,
Title = title,
Category = ParseCategories(title),
Size = size,
Seeders = seeders,
Peers = seeders + leechers,
PublishDate = publishDate,
Genres = row.QuerySelectorAll("label.label-tag").Select(t => t.TextContent.Trim()).ToList(),
DownloadVolumeFactor = hasGlobalFreeleech ? 0 : 1,
UploadVolumeFactor = 1,
MinimumRatio = 1,
MinimumSeedTime = 172800 // 48 hours
};
var posterStyle = row.QuerySelector("div[style^=\"cursor: pointer; background-image:url\"]")?.GetAttribute("style");
if (!string.IsNullOrEmpty(posterStyle))
{
var posterStr = Regex.Match(posterStyle, @"url\('(?<poster>.*)'\);").Groups["poster"].Value;
release.Poster = new Uri(SiteLink + posterStr);
}
releases.Add(release);
}
return releases;
}
/// <summary>
/// Extracts the release title from the third table cell of a torrent row.
/// The cell mixes child elements (tag labels, links) with free text; the
/// title itself is the first non-empty text node.
/// </summary>
/// <param name="titleRow">The title cell element; may be null.</param>
/// <returns>The trimmed title, or null when the cell has no usable text node.</returns>
private static string ParseTitle(IElement titleRow)
{
    // FirstOrDefault (rather than First) so a malformed row yields null and is
    // skipped by the caller instead of throwing and aborting the whole page parse.
    var titleNode = titleRow?.ChildNodes.FirstOrDefault(n => n.NodeType == NodeType.Text && n.TextContent.Trim() != string.Empty);
    return titleNode?.TextContent.Trim();
}
/// <summary>
/// Normalises a search term into the form the site expects: strips standalone
/// season/episode tokens (e.g. "S01", "E02"), drops the leading text together
/// with a trailing 4-digit year (optionally a full yyyy.MM.dd date), collapses
/// separator characters into single spaces and lower-cases the result.
/// </summary>
/// <param name="term">Raw search term; may be null or empty.</param>
/// <returns>The normalised term; empty string for null/empty input.</returns>
private static string FixSearchTerm(string term)
{
    // Guard: Regex.Replace throws on null input.
    if (string.IsNullOrEmpty(term))
        return string.Empty;

    // [SE] instead of the original [S|E]: inside a character class the pipe is
    // a literal, so the old pattern also stripped tokens such as "|123".
    term = Regex.Replace(term, @"\b[SE]\d+\b", string.Empty, RegexOptions.IgnoreCase);
    // Removes everything up to and including a 4-digit year / dated-episode
    // token, leaving an empty term for "Show Name 2020"-style queries.
    term = Regex.Replace(term, @".+\b\d{4}(\.\d{2}\.\d{2})?\b", string.Empty);
    // Collapse dots, whitespace, parentheses and brackets into single spaces.
    term = Regex.Replace(term, @"[\.\s\(\)\[\]]+", " ");
    // Invariant lower-casing: the term is machine-compared, not shown to users.
    return term.ToLowerInvariant().Trim();
}
/// <summary>
/// Maps a release title to a Torznab category: HD when the title mentions a
/// high-definition resolution, SD otherwise.
/// </summary>
protected virtual List<int> ParseCategories(string title)
{
    var isHighDefinition = title.Contains("1080p") || title.Contains("1080i") || title.Contains("720p");

    return isHighDefinition
        ? new List<int> { TorznabCatType.TVHD.ID }
        : new List<int> { TorznabCatType.TVSD.ID };
}
// Detects an expired session in the given response and, when expired, re-runs
// the login flow and replays the original request with the refreshed cookies.
// The Contains() probes are logged-in markers (logout button handler, torrent
// table headers/labels): if any appears, the session is still valid and the
// response is returned untouched.
private async Task<WebResult> ReloginIfNecessaryAsync(WebResult response)
{
if (response.ContentString.Contains("onclick=\"document.location='logout'\"") ||
response.ContentString.Contains("show_id") || response.ContentString.Contains("Filename") ||
response.ContentString.Contains("Peers") || response.ContentString.Contains("Download"))
return response;
logger.Warn("Session expired. Relogin.");
// NOTE(review): assumes ApplyConfiguration(null) re-logs in using the stored
// credentials (LoadValuesFromJson with a null payload) — confirm against base class.
await ApplyConfiguration(null);
// Attach the fresh session cookies and retry the original request once.
response.Request.Cookies = CookieHeader;
return await webclient.GetResultAsync(response.Request);

View file

@ -0,0 +1,31 @@
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using Newtonsoft.Json;
namespace Jackett.Common.Models.IndexerConfig.Bespoke
{
[ExcludeFromCodeCoverage]
internal class ConfigurationDataShazbat : ConfigurationDataBasicLoginWithRSS
{
    // User-selectable cap (1..5) on how many show pages a single search may fetch.
    public SingleSelectConfigurationItem ShowPagesFetchLimit { get; private set; }

    // Informational text warning about the risk of higher fetch limits.
    public DisplayInfoConfigurationItem ShowPagesFetchLimitInstructions { get; private set; }

    public ConfigurationDataShazbat()
    {
        // Offer the values 1 through 5; both key and label are the number itself.
        var limitOptions = new Dictionary<string, string>();
        for (var limit = 1; limit <= 5; limit++)
            limitOptions[limit.ToString()] = limit.ToString();

        ShowPagesFetchLimit = new SingleSelectConfigurationItem("Show Pages Fetch Limit (sub-requests when searching)", limitOptions)
        {
            Value = "2"
        };
        ShowPagesFetchLimitInstructions = new DisplayInfoConfigurationItem("Show Pages Fetch Limit Warning", "Higher values may risk your account being flagged for bot activity when used with automation software such as Sonarr.");
    }
}
}

View file

@ -3,7 +3,7 @@ using System.Diagnostics.CodeAnalysis;
namespace Jackett.Common.Models.IndexerConfig.Bespoke
{
[ExcludeFromCodeCoverage]
public class ConfigurationDataSpeedCD : ConfigurationDataBasicLogin
internal class ConfigurationDataSpeedCD : ConfigurationDataBasicLogin
{
public BoolConfigurationItem Freeleech { get; set; }
public BoolConfigurationItem ExcludeArchives { get; set; }