Added method in ParseUtil that strips out every invalid XML character from a string.

Added a test for ParseUtil.RemoveInvalidXmlChars() using a snippet of RSS from XSeeds that was originally causing problems.
This commit is contained in:
Jay Otterbein 2016-11-03 20:23:12 -05:00 committed by kaso17
parent 72d3f2ea49
commit d1e767bd41
5 changed files with 83 additions and 1 deletions

View File

@ -164,6 +164,7 @@
<Compile Include="TestIIndexerManagerServiceHelper.cs" />
<Compile Include="TestUtil.cs" />
<Compile Include="TestWebClient.cs" />
<Compile Include="Util\ParseUtilTests.cs" />
<Compile Include="Util\ServerUtilTests.cs" />
<Compile Include="Util\TvCategoryParserTests.cs" />
</ItemGroup>
@ -182,6 +183,9 @@
<ItemGroup>
<Folder Include="Indexers\" />
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="Util\Invalid-RSS.xml" />
</ItemGroup>
<Choose>
<When Condition="'$(VisualStudioVersion)' == '10.0' And '$(IsCodedUITest)' == 'True'">
<ItemGroup>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>RSS Syndicator</title>
<link>http://somewebsite.com</link>
<description>
&lt;br /&gt;
Enjoy!&lt;br /&gt;
&lt;br /&gt;
-&lt;br /&gt;
&lt;br /&gt;
 group info&lt;br /&gt;
&lt;br /&gt;
Know Your Role and Shut Your Mouth!&lt;br /&gt;
&lt;br /&gt;
 we are now looking for...&lt;br /&gt;
&lt;br /&gt;
</description>
</channel>
</rss>

View File

@ -0,0 +1,46 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Xml.Linq;
using System.Xml.XPath;
using FluentAssertions;
using Jackett.Utils;
using NUnit.Framework;
namespace JackettTest.Util
{
    /// <summary>
    /// Tests for <see cref="ParseUtil"/>.
    /// </summary>
    [TestFixture]
    public class ParseUtilTests
    {
        /// <summary>
        /// Text of the embedded <c>Invalid-RSS.xml</c> fixture, looked up under this
        /// class's namespace in the test assembly's manifest resources.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// The embedded resource could not be found in the test assembly.
        /// </exception>
        private static string InvalidRssXml
        {
            get
            {
                var type = typeof(ParseUtilTests);
                var resourceName = $"{type.Namespace}.Invalid-RSS.xml";
                using (var resourceStream = type.Assembly.GetManifestResourceStream(resourceName))
                {
                    // GetManifestResourceStream returns null for an unknown name; fail with a
                    // message that names the resource instead of letting StreamReader throw
                    // an ArgumentNullException that hides the real cause.
                    if (resourceStream == null)
                    {
                        throw new InvalidOperationException(
                            $"Embedded resource '{resourceName}' was not found in assembly '{type.Assembly.FullName}'.");
                    }

                    using (var sr = new StreamReader(resourceStream))
                    {
                        return sr.ReadToEnd();
                    }
                }
            }
        }

        /// <summary>
        /// Checks that the raw fixture is rejected by <see cref="XDocument.Parse(string)"/>
        /// because of a 0x07 character, and that it parses once
        /// <see cref="ParseUtil.RemoveInvalidXmlChars"/> has been applied, with the
        /// description text still present.
        /// </summary>
        [Test]
        public void Invalid_RSS_should_parse_after_removing_invalid_chars()
        {
            var invalidRss = InvalidRssXml;

            // Precondition: the untouched fixture must fail to parse.
            Action parseAction = () => XDocument.Parse(invalidRss);
            parseAction.ShouldThrow<Exception>().WithMessage("'\a', hexadecimal value 0x07, is an invalid character. Line 12, position 7.");

            var validRss = ParseUtil.RemoveInvalidXmlChars(invalidRss);
            var rssDoc = XDocument.Parse(validRss);

            rssDoc.Root.Should().NotBeNull();
            var description = rssDoc.Root.XPathSelectElement("//description");
            description.Value.Should().Contain("Know Your Role and Shut Your Mouth!");
        }
    }
}

View File

@ -13,6 +13,7 @@ using System.Threading.Tasks;
using Jackett.Models.IndexerConfig;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using static Jackett.Utils.ParseUtil;
namespace Jackett.Indexers
{
@ -128,7 +129,7 @@ namespace Jackett.Indexers
if (rssPage.Content.EndsWith("\0")) {
rssPage.Content = rssPage.Content.Substring(0, rssPage.Content.Length - 1);
}
rssPage.Content = rssPage.Content.Replace("\0x10", "").Replace("\0x07", "");
rssPage.Content = RemoveInvalidXmlChars(rssPage.Content);
var rssDoc = XDocument.Parse(rssPage.Content);
foreach (var item in rssDoc.Descendants("item"))

View File

@ -1,9 +1,15 @@
using System.Globalization;
using System.Text.RegularExpressions;
namespace Jackett.Utils
{
public static class ParseUtil
{
// Pattern used by RemoveInvalidXmlChars; every match is replaced with an empty string.
// It has three alternatives:
//   1. a low surrogate (U+DC00-U+DFFF) that is NOT preceded by a high surrogate;
//   2. a high surrogate (U+D800-U+DBFF) that is NOT followed by a low surrogate;
//   3. a single character in 0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F-0x9F,
//      or one of U+FEFF, U+FFFE, U+FFFF.
// Tab (0x09), LF (0x0A) and CR (0x0D) are deliberately outside the class and are kept.
// NOTE(review): XML 1.0 appears to permit 0x7F-0x9F and U+FEFF as characters, so this
// strips more than the strictly illegal set - confirm that is intentional.
private static readonly Regex InvalidXmlChars =
new Regex(
@"(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F\uFEFF\uFFFE\uFFFF]",
RegexOptions.Compiled);
public static string NormalizeSpace(string s)
{
return s.Trim();
@ -17,6 +23,11 @@ namespace Jackett.Utils
return normalized;
}
/// <summary>
/// Removes every match of the <c>InvalidXmlChars</c> pattern from <paramref name="text"/>.
/// </summary>
/// <param name="text">The string to clean; may be null or empty.</param>
/// <returns>
/// An empty string when <paramref name="text"/> is null or empty; otherwise
/// <paramref name="text"/> with all pattern matches deleted.
/// </returns>
public static string RemoveInvalidXmlChars(string text)
{
    // Null and empty input both collapse to the empty string.
    if (string.IsNullOrEmpty(text))
    {
        return "";
    }

    return InvalidXmlChars.Replace(text, "");
}
public static double CoerceDouble(string str)
{
return double.Parse(NormalizeNumber(str), NumberStyles.Any, CultureInfo.InvariantCulture);