diff --git a/src/Jackett.Common/Models/ResultPage.cs b/src/Jackett.Common/Models/ResultPage.cs index 2c87806d7..38759e7c8 100644 --- a/src/Jackett.Common/Models/ResultPage.cs +++ b/src/Jackett.Common/Models/ResultPage.cs @@ -2,6 +2,7 @@ using System; using System.Collections.Generic; using System.Globalization; using System.Linq; +using System.Text.RegularExpressions; using System.Threading; using System.Xml.Linq; @@ -9,10 +10,16 @@ namespace Jackett.Common.Models { public class ResultPage { - private static readonly XNamespace atomNs = "http://www.w3.org/2005/Atom"; - private static readonly XNamespace torznabNs = "http://torznab.com/schemas/2015/feed"; + private static readonly XNamespace _AtomNs = "http://www.w3.org/2005/Atom"; + private static readonly XNamespace _TorznabNs = "http://torznab.com/schemas/2015/feed"; - public ChannelInfo ChannelInfo { get; private set; } + // filters control characters but allows only properly-formed surrogate sequences + // https://stackoverflow.com/a/961504 + private static readonly Regex _InvalidXmlChars = new Regex( + @"(? Releases { get; set; } public ResultPage(ChannelInfo channelInfo) @@ -21,15 +28,29 @@ namespace Jackett.Common.Models Releases = new List(); } - private string xmlDateFormat(DateTime dt) + /// + /// removes any unusual unicode characters that can't be encoded into XML (eg 0x1A) + /// + private static string RemoveInvalidXMLChars(string text) + { + if (text == null) + return null; + return _InvalidXmlChars.Replace(text, ""); + } + + private static string XmlDateFormat(DateTime dt) { Thread.CurrentThread.CurrentCulture = new CultureInfo("en-US"); //Sat, 14 Mar 2015 17:10:42 -0400 - var f = string.Format(@"{0:ddd, dd MMM yyyy HH:mm:ss }{1}", dt, string.Format("{0:zzz}", dt).Replace(":", "")); - return f; + return $"{dt:ddd, dd MMM yyyy HH:mm:ss} " + $"{dt:zzz}".Replace(":", ""); } - private XElement getTorznabElement(string name, object value) => value == null ? null : new XElement(torznabNs + "attr", new XAttribute("name", name), new XAttribute("value", value)); + private static XElement GetTorznabElement(string name, object value) + { + if (value == null) + return null; + return new XElement(_TorznabNs + "attr", new XAttribute("name", name), new XAttribute("value", value)); + } public string ToXml(Uri selfAtom) { @@ -39,10 +60,10 @@ namespace Jackett.Common.Models new XDeclaration("1.0", "UTF-8", null), new XElement("rss", new XAttribute("version", "1.0"), - new XAttribute(XNamespace.Xmlns + "atom", atomNs.NamespaceName), - new XAttribute(XNamespace.Xmlns + "torznab", torznabNs.NamespaceName), + new XAttribute(XNamespace.Xmlns + "atom", _AtomNs.NamespaceName), + new XAttribute(XNamespace.Xmlns + "torznab", _TorznabNs.NamespaceName), new XElement("channel", - new XElement(atomNs + "link", + new XElement(_AtomNs + "link", new XAttribute("href", selfAtom.AbsoluteUri), new XAttribute("rel", "self"), new XAttribute("type", "application/rss+xml") @@ -60,15 +81,15 @@ namespace Jackett.Common.Models ), from r in Releases select new XElement("item", - new XElement("title", r.Title), + new XElement("title", RemoveInvalidXMLChars(r.Title)), new XElement("guid", r.Guid.AbsoluteUri), // GUID and (Link or Magnet) are mandatory new XElement("jackettindexer", new XAttribute("id", r.Origin.Id), r.Origin.DisplayName), r.Comments == null ? null : new XElement("comments", r.Comments.AbsoluteUri), - r.PublishDate == DateTime.MinValue ? new XElement("pubDate", xmlDateFormat(DateTime.Now)) : new XElement("pubDate", xmlDateFormat(r.PublishDate)), + r.PublishDate == DateTime.MinValue ? new XElement("pubDate", XmlDateFormat(DateTime.Now)) : new XElement("pubDate", XmlDateFormat(r.PublishDate)), r.Size == null ? null : new XElement("size", r.Size), r.Files == null ? null : new XElement("files", r.Files), r.Grabs == null ? null : new XElement("grabs", r.Grabs), - new XElement("description", r.Description), + new XElement("description", RemoveInvalidXMLChars(r.Description)), new XElement("link", r.Link?.AbsoluteUri ?? r.MagnetUri.AbsoluteUri), r.Category == null ? null : from c in r.Category select new XElement("category", c), new XElement( @@ -77,27 +98,27 @@ namespace Jackett.Common.Models r.Size == null ? null : new XAttribute("length", r.Size), new XAttribute("type", "application/x-bittorrent") ), - r.Category == null ? null : from c in r.Category select getTorznabElement("category", c), - getTorznabElement("magneturl", r.MagnetUri?.AbsoluteUri), - getTorznabElement("rageid", r.RageID), - getTorznabElement("thetvdb", r.TVDBId), - getTorznabElement("imdb", r.Imdb == null ? null : ((long)r.Imdb).ToString("D7")), - getTorznabElement("tmdb", r.TMDb), - getTorznabElement("author", r.Author), - getTorznabElement("booktitle", r.BookTitle), - getTorznabElement("seeders", r.Seeders), - getTorznabElement("peers", r.Peers), - getTorznabElement("infohash", r.InfoHash), - getTorznabElement("minimumratio", r.MinimumRatio), - getTorznabElement("minimumseedtime", r.MinimumSeedTime), - getTorznabElement("downloadvolumefactor", r.DownloadVolumeFactor), - getTorznabElement("uploadvolumefactor", r.UploadVolumeFactor) + r.Category == null ? null : from c in r.Category select GetTorznabElement("category", c), + GetTorznabElement("magneturl", r.MagnetUri?.AbsoluteUri), + GetTorznabElement("rageid", r.RageID), + GetTorznabElement("thetvdb", r.TVDBId), + GetTorznabElement("imdb", r.Imdb?.ToString("D7")), + GetTorznabElement("tmdb", r.TMDb), + GetTorznabElement("author", RemoveInvalidXMLChars(r.Author)), + GetTorznabElement("booktitle", RemoveInvalidXMLChars(r.BookTitle)), + GetTorznabElement("seeders", r.Seeders), + GetTorznabElement("peers", r.Peers), + GetTorznabElement("infohash", RemoveInvalidXMLChars(r.InfoHash)), + GetTorznabElement("minimumratio", r.MinimumRatio), + GetTorznabElement("minimumseedtime", r.MinimumSeedTime), + GetTorznabElement("downloadvolumefactor", r.DownloadVolumeFactor), + GetTorznabElement("uploadvolumefactor", r.UploadVolumeFactor) ) ) ) ); - return xdoc.Declaration.ToString() + Environment.NewLine + xdoc.ToString(); + return xdoc.Declaration + Environment.NewLine + xdoc; } } } diff --git a/src/Jackett.Test/Models/ResultPageTests.cs b/src/Jackett.Test/Models/ResultPageTests.cs new file mode 100644 index 000000000..574313593 --- /dev/null +++ b/src/Jackett.Test/Models/ResultPageTests.cs @@ -0,0 +1,83 @@ +using System; +using System.Collections.Generic; +using System.Text.RegularExpressions; +using System.Threading.Tasks; +using Jackett.Common.Indexers; +using Jackett.Common.Models; +using Newtonsoft.Json.Linq; +using NUnit.Framework; +using Assert = NUnit.Framework.Assert; + +namespace Jackett.Test.Models +{ + class TestIndexer : BaseIndexer + { + public TestIndexer() + : base(id: "test_id", + name: "test_name", + description: "test_description", + link: "https://test.link/", + configService: null, + logger: null, + configData: null, + p: null) + { + } + + public override TorznabCapabilities TorznabCaps { get; protected set; } + public override Task ApplyConfiguration(JToken configJson) => throw new NotImplementedException(); + protected override Task> PerformQuery(TorznabQuery query) => throw new NotImplementedException(); + } + + [TestFixture] + public class ResultPageTests + { + [Test] + public void TestXmlWithInvalidCharacters() + { + // 0x1A can't be represented in XML => https://stackoverflow.com/a/8506173 + // some ascii and unicode characters + var text = "Title Ñ 理" + Convert.ToChar("\u001a") + Convert.ToChar("\u2813"); + var validText = "Title Ñ 理" + Convert.ToChar("\u2813"); + + // link with characters that requires URL encode + var link = new Uri("https://example.com/" + text); + var validLink = "https://example.com/Title%20%C3%91%20%E7%90%86%1A%E2%A0%93"; + + var resultPage = new ResultPage( + new ChannelInfo // characters in channel info are safe because are provided by us + { + Link = link, + ImageUrl = link, + ImageLink = link + }) + { + Releases = new List + { + new ReleaseInfo // these fields are from websites and they can be problematic + { + Title = text, + Guid = link, + Link = link, + Comments = link, + PublishDate = new DateTime(2020, 09, 22), + Description = text, + Author = text, + BookTitle = text, + BannerUrl = link, + InfoHash = text, + MagnetUri = link, + Origin = new TestIndexer() + } + } + }; + var xml = resultPage.ToXml(link); + + Assert.AreEqual(5, Regex.Matches(xml, validText).Count); + Assert.AreEqual(9, Regex.Matches(xml, validLink).Count); + + // this should be in another test but it's here to avoid creating the whole object again + Assert.True(xml.Contains("Tue, 22 Sep 2020 00:00:00 ")); + } + } +}