mirror of https://github.com/Jackett/Jackett
Added method in ParseUtil that strips out every invalid XML character from a string.
Added a test for ParseUtil.RemoveInvalidXmlChars() using a snippet of RSS from XSeeds that was originally causing problems.
This commit is contained in:
parent
72d3f2ea49
commit
d1e767bd41
|
@ -164,6 +164,7 @@
|
|||
<Compile Include="TestIIndexerManagerServiceHelper.cs" />
|
||||
<Compile Include="TestUtil.cs" />
|
||||
<Compile Include="TestWebClient.cs" />
|
||||
<Compile Include="Util\ParseUtilTests.cs" />
|
||||
<Compile Include="Util\ServerUtilTests.cs" />
|
||||
<Compile Include="Util\TvCategoryParserTests.cs" />
|
||||
</ItemGroup>
|
||||
|
@ -182,6 +183,9 @@
|
|||
<ItemGroup>
|
||||
<Folder Include="Indexers\" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<EmbeddedResource Include="Util\Invalid-RSS.xml" />
|
||||
</ItemGroup>
|
||||
<Choose>
|
||||
<When Condition="'$(VisualStudioVersion)' == '10.0' And '$(IsCodedUITest)' == 'True'">
|
||||
<ItemGroup>
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rss version="2.0">
|
||||
<channel>
|
||||
<title>RSS Syndicator</title>
|
||||
<link>http://somewebsite.com</link>
|
||||
<description>
|
||||
<br />
|
||||
Enjoy!<br />
|
||||
<br />
|
||||
-<br />
|
||||
<br />
|
||||
group info<br />
|
||||
<br />
|
||||
Know Your Role and Shut Your Mouth!<br />
|
||||
<br />
|
||||
we are now looking for...<br />
|
||||
<br />
|
||||
</description>
|
||||
</channel>
|
||||
</rss>
|
|
@ -0,0 +1,46 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using System.Xml.Linq;
|
||||
using System.Xml.XPath;
|
||||
using FluentAssertions;
|
||||
using Jackett.Utils;
|
||||
using NUnit.Framework;
|
||||
|
||||
namespace JackettTest.Util
|
||||
{
|
||||
[TestFixture]
|
||||
public class ParseUtilTests
|
||||
{
|
||||
private static string InvalidRssXml
|
||||
{
|
||||
get
|
||||
{
|
||||
var type = typeof(ParseUtilTests);
|
||||
using (var resourceStream = type.Assembly.GetManifestResourceStream($"{type.Namespace}.Invalid-RSS.xml"))
|
||||
using (var sr = new StreamReader(resourceStream))
|
||||
{
|
||||
return sr.ReadToEnd();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void Invalid_RSS_should_parse_after_removing_invalid_chars()
|
||||
{
|
||||
var invalidRss = InvalidRssXml;
|
||||
Action parseAction = () => XDocument.Parse(invalidRss);
|
||||
parseAction.ShouldThrow<Exception>().WithMessage("'\a', hexadecimal value 0x07, is an invalid character. Line 12, position 7.");
|
||||
|
||||
var validRSs = ParseUtil.RemoveInvalidXmlChars(invalidRss);
|
||||
var rssDoc = XDocument.Parse(validRSs);
|
||||
rssDoc.Root.Should().NotBeNull();
|
||||
var description = rssDoc.Root.XPathSelectElement("//description");
|
||||
description.Value.Should().Contain("Know Your Role and Shut Your Mouth!");
|
||||
}
|
||||
}
|
||||
}
|
|
@ -13,6 +13,7 @@ using System.Threading.Tasks;
|
|||
using Jackett.Models.IndexerConfig;
|
||||
using System.Text.RegularExpressions;
|
||||
using System.Xml.Linq;
|
||||
using static Jackett.Utils.ParseUtil;
|
||||
|
||||
namespace Jackett.Indexers
|
||||
{
|
||||
|
@ -128,7 +129,7 @@ namespace Jackett.Indexers
|
|||
if (rssPage.Content.EndsWith("\0")) {
|
||||
rssPage.Content = rssPage.Content.Substring(0, rssPage.Content.Length - 1);
|
||||
}
|
||||
rssPage.Content = rssPage.Content.Replace("\0x10", "").Replace("\0x07", "");
|
||||
rssPage.Content = RemoveInvalidXmlChars(rssPage.Content);
|
||||
var rssDoc = XDocument.Parse(rssPage.Content);
|
||||
|
||||
foreach (var item in rssDoc.Descendants("item"))
|
||||
|
|
|
@ -1,9 +1,15 @@
|
|||
using System.Globalization;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace Jackett.Utils
|
||||
{
|
||||
public static class ParseUtil
|
||||
{
|
||||
private static readonly Regex InvalidXmlChars =
|
||||
new Regex(
|
||||
@"(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F\uFEFF\uFFFE\uFFFF]",
|
||||
RegexOptions.Compiled);
|
||||
|
||||
public static string NormalizeSpace(string s)
|
||||
{
|
||||
return s.Trim();
|
||||
|
@ -17,6 +23,11 @@ namespace Jackett.Utils
|
|||
return normalized;
|
||||
}
|
||||
|
||||
public static string RemoveInvalidXmlChars(string text)
|
||||
{
|
||||
return string.IsNullOrEmpty(text) ? "" : InvalidXmlChars.Replace(text, "");
|
||||
}
|
||||
|
||||
public static double CoerceDouble(string str)
|
||||
{
|
||||
return double.Parse(NormalizeNumber(str), NumberStyles.Any, CultureInfo.InvariantCulture);
|
||||
|
|
Loading…
Reference in New Issue