core: fix invalid characters in xml/rss. resolves #9118 (#9636)

This commit is contained in:
Diego Heras 2020-09-25 02:40:13 +02:00 committed by GitHub
parent 61eb75f7e1
commit a1108bc5a2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 133 additions and 29 deletions

View File

@ -2,6 +2,7 @@ using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text.RegularExpressions;
using System.Threading;
using System.Xml.Linq;
@ -9,10 +10,16 @@ namespace Jackett.Common.Models
{
public class ResultPage
{
private static readonly XNamespace atomNs = "http://www.w3.org/2005/Atom";
private static readonly XNamespace torznabNs = "http://torznab.com/schemas/2015/feed";
private static readonly XNamespace _AtomNs = "http://www.w3.org/2005/Atom";
private static readonly XNamespace _TorznabNs = "http://torznab.com/schemas/2015/feed";
public ChannelInfo ChannelInfo { get; private set; }
// matches control characters, U+FEFF/U+FFFE/U+FFFF and unpaired surrogates; well-formed surrogate pairs are left alone
// https://stackoverflow.com/a/961504
private static readonly Regex _InvalidXmlChars = new Regex(
@"(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F\uFEFF\uFFFE\uFFFF]",
RegexOptions.Compiled);
private ChannelInfo ChannelInfo { get; }
public IEnumerable<ReleaseInfo> Releases { get; set; }
public ResultPage(ChannelInfo channelInfo)
@ -21,15 +28,29 @@ namespace Jackett.Common.Models
Releases = new List<ReleaseInfo>();
}
private string xmlDateFormat(DateTime dt)
/// <summary>
/// Strips every character matched by <c>_InvalidXmlChars</c> (for example 0x1A) from the given text,
/// so the result can be embedded in an XML document.
/// </summary>
/// <param name="text">Text to clean; may be null.</param>
/// <returns>The text with all matches removed, or null when <paramref name="text"/> is null.</returns>
private static string RemoveInvalidXMLChars(string text) =>
    text == null
        ? null
        : _InvalidXmlChars.Replace(text, "");
/// <summary>
/// Formats a date as an RSS style timestamp, e.g. "Sat, 14 Mar 2015 17:10:42 -0400".
/// </summary>
/// <param name="dt">Date to format.</param>
/// <returns>
/// Abbreviated English day and month names, a 24-hour time and the UTC offset without the colon.
/// </returns>
private static string XmlDateFormat(DateTime dt)
{
    // Format with the invariant culture (English day/month names) instead of overwriting
    // Thread.CurrentThread.CurrentCulture: the formatting no longer leaks a culture change
    // into whatever else runs on the calling thread.
    var timestamp = dt.ToString("ddd, dd MMM yyyy HH:mm:ss", CultureInfo.InvariantCulture);

    // "zzz" renders the offset as +hh:mm; the target format wants +hhmm
    var offset = dt.ToString("zzz", CultureInfo.InvariantCulture).Replace(":", "");

    return timestamp + " " + offset;
}
private XElement getTorznabElement(string name, object value) => value == null ? null : new XElement(torznabNs + "attr", new XAttribute("name", name), new XAttribute("value", value));
/// <summary>
/// Builds an "attr" element in the torznab namespace carrying a name/value attribute pair.
/// </summary>
/// <param name="name">Content of the "name" attribute.</param>
/// <param name="value">Content of the "value" attribute; null means there is nothing to emit.</param>
/// <returns>The new element, or null when <paramref name="value"/> is null.</returns>
private static XElement GetTorznabElement(string name, object value)
{
    if (value != null)
    {
        var nameAttribute = new XAttribute("name", name);
        var valueAttribute = new XAttribute("value", value);
        return new XElement(_TorznabNs + "attr", nameAttribute, valueAttribute);
    }

    return null;
}
public string ToXml(Uri selfAtom)
{
@ -39,10 +60,10 @@ namespace Jackett.Common.Models
new XDeclaration("1.0", "UTF-8", null),
new XElement("rss",
new XAttribute("version", "1.0"),
new XAttribute(XNamespace.Xmlns + "atom", atomNs.NamespaceName),
new XAttribute(XNamespace.Xmlns + "torznab", torznabNs.NamespaceName),
new XAttribute(XNamespace.Xmlns + "atom", _AtomNs.NamespaceName),
new XAttribute(XNamespace.Xmlns + "torznab", _TorznabNs.NamespaceName),
new XElement("channel",
new XElement(atomNs + "link",
new XElement(_AtomNs + "link",
new XAttribute("href", selfAtom.AbsoluteUri),
new XAttribute("rel", "self"),
new XAttribute("type", "application/rss+xml")
@ -60,15 +81,15 @@ namespace Jackett.Common.Models
),
from r in Releases
select new XElement("item",
new XElement("title", r.Title),
new XElement("title", RemoveInvalidXMLChars(r.Title)),
new XElement("guid", r.Guid.AbsoluteUri), // GUID and (Link or Magnet) are mandatory
new XElement("jackettindexer", new XAttribute("id", r.Origin.Id), r.Origin.DisplayName),
r.Comments == null ? null : new XElement("comments", r.Comments.AbsoluteUri),
r.PublishDate == DateTime.MinValue ? new XElement("pubDate", xmlDateFormat(DateTime.Now)) : new XElement("pubDate", xmlDateFormat(r.PublishDate)),
r.PublishDate == DateTime.MinValue ? new XElement("pubDate", XmlDateFormat(DateTime.Now)) : new XElement("pubDate", XmlDateFormat(r.PublishDate)),
r.Size == null ? null : new XElement("size", r.Size),
r.Files == null ? null : new XElement("files", r.Files),
r.Grabs == null ? null : new XElement("grabs", r.Grabs),
new XElement("description", r.Description),
new XElement("description", RemoveInvalidXMLChars(r.Description)),
new XElement("link", r.Link?.AbsoluteUri ?? r.MagnetUri.AbsoluteUri),
r.Category == null ? null : from c in r.Category select new XElement("category", c),
new XElement(
@ -77,27 +98,27 @@ namespace Jackett.Common.Models
r.Size == null ? null : new XAttribute("length", r.Size),
new XAttribute("type", "application/x-bittorrent")
),
r.Category == null ? null : from c in r.Category select getTorznabElement("category", c),
getTorznabElement("magneturl", r.MagnetUri?.AbsoluteUri),
getTorznabElement("rageid", r.RageID),
getTorznabElement("thetvdb", r.TVDBId),
getTorznabElement("imdb", r.Imdb == null ? null : ((long)r.Imdb).ToString("D7")),
getTorznabElement("tmdb", r.TMDb),
getTorznabElement("author", r.Author),
getTorznabElement("booktitle", r.BookTitle),
getTorznabElement("seeders", r.Seeders),
getTorznabElement("peers", r.Peers),
getTorznabElement("infohash", r.InfoHash),
getTorznabElement("minimumratio", r.MinimumRatio),
getTorznabElement("minimumseedtime", r.MinimumSeedTime),
getTorznabElement("downloadvolumefactor", r.DownloadVolumeFactor),
getTorznabElement("uploadvolumefactor", r.UploadVolumeFactor)
r.Category == null ? null : from c in r.Category select GetTorznabElement("category", c),
GetTorznabElement("magneturl", r.MagnetUri?.AbsoluteUri),
GetTorznabElement("rageid", r.RageID),
GetTorznabElement("thetvdb", r.TVDBId),
GetTorznabElement("imdb", r.Imdb?.ToString("D7")),
GetTorznabElement("tmdb", r.TMDb),
GetTorznabElement("author", RemoveInvalidXMLChars(r.Author)),
GetTorznabElement("booktitle", RemoveInvalidXMLChars(r.BookTitle)),
GetTorznabElement("seeders", r.Seeders),
GetTorznabElement("peers", r.Peers),
GetTorznabElement("infohash", RemoveInvalidXMLChars(r.InfoHash)),
GetTorznabElement("minimumratio", r.MinimumRatio),
GetTorznabElement("minimumseedtime", r.MinimumSeedTime),
GetTorznabElement("downloadvolumefactor", r.DownloadVolumeFactor),
GetTorznabElement("uploadvolumefactor", r.UploadVolumeFactor)
)
)
)
);
return xdoc.Declaration.ToString() + Environment.NewLine + xdoc.ToString();
return xdoc.Declaration + Environment.NewLine + xdoc;
}
}
}

View File

@ -0,0 +1,83 @@
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using Jackett.Common.Indexers;
using Jackett.Common.Models;
using Newtonsoft.Json.Linq;
using NUnit.Framework;
using Assert = NUnit.Framework.Assert;
namespace Jackett.Test.Models
{
/// <summary>
/// Bare-bones indexer stub for tests: it only passes fixed identity values to the base
/// constructor, and both overridden operations throw <see cref="NotImplementedException"/>.
/// </summary>
internal class TestIndexer : BaseIndexer
{
    public TestIndexer()
        : base(
            id: "test_id",
            name: "test_name",
            description: "test_description",
            link: "https://test.link/",
            configService: null,
            logger: null,
            configData: null,
            p: null)
    {
    }

    public override TorznabCapabilities TorznabCaps { get; protected set; }

    /// <summary>Not supported by this stub.</summary>
    public override Task<IndexerConfigurationStatus> ApplyConfiguration(JToken configJson)
    {
        throw new NotImplementedException();
    }

    /// <summary>Not supported by this stub.</summary>
    protected override Task<IEnumerable<ReleaseInfo>> PerformQuery(TorznabQuery query)
    {
        throw new NotImplementedException();
    }
}
[TestFixture]
public class ResultPageTests
{
    /// <summary>
    /// Serializes a result page whose release fields contain a character that XML cannot
    /// represent and checks that the character is dropped from the text fields while
    /// the links come out percent-encoded.
    /// </summary>
    [Test]
    public void TestXmlWithInvalidCharacters()
    {
        // 0x1A can't be represented in XML => https://stackoverflow.com/a/8506173
        const char invalidChar = '\u001a';
        const char brailleChar = '\u2813';

        // some ascii and unicode characters
        var text = "Title Ñ 理" + invalidChar + brailleChar;
        var validText = "Title Ñ 理" + brailleChar;

        // link with characters that require URL encoding
        var link = new Uri("https://example.com/" + text);
        var validLink = "https://example.com/Title%20%C3%91%20%E7%90%86%1A%E2%A0%93";

        var resultPage = new ResultPage(
            new ChannelInfo // characters in channel info are safe because they are provided by us
            {
                Link = link,
                ImageUrl = link,
                ImageLink = link
            })
        {
            Releases = new List<ReleaseInfo>
            {
                new ReleaseInfo // these fields come from websites and can be problematic
                {
                    Title = text,
                    Guid = link,
                    Link = link,
                    Comments = link,
                    PublishDate = new DateTime(2020, 09, 22),
                    Description = text,
                    Author = text,
                    BookTitle = text,
                    BannerUrl = link,
                    InfoHash = text,
                    MagnetUri = link,
                    Origin = new TestIndexer()
                }
            }
        };

        var xml = resultPage.ToXml(link);

        // The expected strings are literals, not patterns: escape them so that regex
        // metacharacters (such as the '.' in the host name) only match themselves.
        Assert.AreEqual(5, Regex.Matches(xml, Regex.Escape(validText)).Count);
        Assert.AreEqual(9, Regex.Matches(xml, Regex.Escape(validLink)).Count);

        // this should be in another test but it's here to avoid creating the whole object again
        Assert.True(xml.Contains("Tue, 22 Sep 2020 00:00:00 "));
    }
}
}