using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using NLog;
using NzbDrone.Common.Extensions;
using NzbDrone.Common.Instrumentation;
using NzbDrone.Core.Languages;

namespace NzbDrone.Core.Parser
{
    /// <summary>
    /// Detects the language of a release title or of a subtitle file name using
    /// keyword lookups and regular expressions.
    /// </summary>
    public static class LanguageParser
    {
        private static readonly Logger Logger = NzbDroneLogger.GetLogger(typeof(LanguageParser));

        // Pattern captures everything from the "Sxx[Eyy...]" marker onwards as $1, so the
        // series name in front of it cannot be mistaken for a language keyword.
        private static readonly RegexReplace[] CleanSeriesTitleRegex = new[]
        {
            new RegexReplace(@".*?[_. ](S\d{2}(?:E\d{2,4})*[_. ].*)", "$1", RegexOptions.Compiled | RegexOptions.IgnoreCase)
        };

        // Case-insensitive language markers. Each alternative is a named group; the group
        // names are the keys looked up in RegexLanguage.
        private static readonly Regex LanguageRegex = new Regex(@"(?:\W|_)(?<italian>\b(?:ita|italian)\b)|(?<german>german\b|videomann|ger[. ]dub)|(?<flemish>flemish)|(?<greek>greek)|(?<french>(?:\W|_)(?:FR|VF|VF2|VFF|VFQ|TRUEFRENCH)(?:\W|_))|(?<russian>\brus\b)|(?<hungarian>\b(?:HUNDUB|HUN)\b)|(?<hebrew>\bHebDub\b)|(?<polish>\b(?:PL\W?DUB|DUB\W?PL|LEK\W?PL|PL\W?LEK)\b)|(?<chinese>\[(?:CH[ST]|BIG5|GB)\]|简|繁|字幕)|(?<bulgarian>\bbgaudio\b)|(?<spanish>\b(?:español|castellano)\b)|(?<ukrainian>\b(?:ukr)\b)",
                                                                RegexOptions.IgnoreCase | RegexOptions.Compiled);

        // Two-letter codes that are only trusted in upper case. The codes themselves are
        // matched case-sensitively; the (?i) look-arounds on either side are scoped to their
        // own groups and reject a code that sits directly next to "SUB" in any casing.
        private static readonly Regex CaseSensitiveLanguageRegex = new Regex(@"(?:(?i)(?<!SUB[\W|_|^]))(?:(?<lithuanian>\bLT\b)|(?<czech>\bCZ\b)|(?<polish>\bPL\b)|(?<bulgarian>\bBG\b))(?:(?i)(?![\W|_|^]SUB))",
                                                                             RegexOptions.Compiled);

        // "<name><sep><iso_code>[<sep><tag>]*": a 2-3 letter code at the end of the file name,
        // optionally followed by any number of known subtitle tags.
        private static readonly Regex SubtitleLanguageRegex = new Regex(".+?[-_. ](?<iso_code>[a-z]{2,3})([-_. ](?<tags>full|forced|foreign|default|cc|psdh|sdh))*$", RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>
        /// Determines the language of a release from its title.
        /// </summary>
        /// <param name="title">Release title to inspect.</param>
        /// <param name="defaultToEnglish">
        /// When no language can be detected, return <c>Language.English</c> if <c>true</c>,
        /// otherwise <c>Language.Unknown</c>.
        /// </param>
        /// <returns>
        /// The first language whose full English name occurs in the title (checked in a fixed
        /// order), otherwise the language found by <c>RegexLanguage</c>, otherwise English when
        /// the title contains "english", otherwise the default described above.
        /// </returns>
        public static Language ParseLanguage(string title, bool defaultToEnglish = true)
        {
            // Only the first pattern that applies is used.
            // NOTE(review): TryReplace presumably rewrites title in place when it matches - confirm.
            foreach (var regex in CleanSeriesTitleRegex)
            {
                if (regex.TryReplace(ref title))
                {
                    break;
                }
            }

            // Invariant lower-casing: the keywords below are ASCII, and culture-sensitive
            // casing (e.g. Turkish dotless i) would stop "ICELANDIC" from matching "icelandic".
            var lowerTitle = title.ToLowerInvariant();

            if (lowerTitle.Contains("french"))
            {
                return Language.French;
            }

            if (lowerTitle.Contains("spanish"))
            {
                return Language.Spanish;
            }

            if (lowerTitle.Contains("danish"))
            {
                return Language.Danish;
            }

            if (lowerTitle.Contains("dutch"))
            {
                return Language.Dutch;
            }

            if (lowerTitle.Contains("japanese"))
            {
                return Language.Japanese;
            }

            if (lowerTitle.Contains("icelandic"))
            {
                return Language.Icelandic;
            }

            if (lowerTitle.Contains("mandarin") || lowerTitle.Contains("cantonese") || lowerTitle.Contains("chinese"))
            {
                return Language.Chinese;
            }

            if (lowerTitle.Contains("korean"))
            {
                return Language.Korean;
            }

            if (lowerTitle.Contains("russian"))
            {
                return Language.Russian;
            }

            if (lowerTitle.Contains("polish"))
            {
                return Language.Polish;
            }

            if (lowerTitle.Contains("vietnamese"))
            {
                return Language.Vietnamese;
            }

            if (lowerTitle.Contains("swedish"))
            {
                return Language.Swedish;
            }

            if (lowerTitle.Contains("norwegian"))
            {
                return Language.Norwegian;
            }

            if (lowerTitle.Contains("finnish"))
            {
                return Language.Finnish;
            }

            if (lowerTitle.Contains("turkish"))
            {
                return Language.Turkish;
            }

            if (lowerTitle.Contains("portuguese"))
            {
                return Language.Portuguese;
            }

            if (lowerTitle.Contains("hungarian"))
            {
                return Language.Hungarian;
            }

            if (lowerTitle.Contains("hebrew"))
            {
                return Language.Hebrew;
            }

            if (lowerTitle.Contains("arabic"))
            {
                return Language.Arabic;
            }

            if (lowerTitle.Contains("hindi"))
            {
                return Language.Hindi;
            }

            if (lowerTitle.Contains("malayalam"))
            {
                return Language.Malayalam;
            }

            if (lowerTitle.Contains("ukrainian"))
            {
                return Language.Ukrainian;
            }

            if (lowerTitle.Contains("bulgarian"))
            {
                return Language.Bulgarian;
            }

            // The regexes run on the original-case title: some markers are case-sensitive.
            var regexLanguage = RegexLanguage(title);

            if (regexLanguage != Language.Unknown)
            {
                return regexLanguage;
            }

            // "english" is checked last so that any other detected language wins over it.
            if (lowerTitle.Contains("english"))
            {
                return Language.English;
            }

            return defaultToEnglish ? Language.English : Language.Unknown;
        }

        /// <summary>
        /// Determines the language of a subtitle file from its file name.
        /// </summary>
        /// <param name="fileName">Subtitle file name or path; the extension is ignored.</param>
        /// <returns>
        /// The language for the trailing ISO code when the name matches the subtitle pattern
        /// (<c>Language.Unknown</c> if that code is not recognised); otherwise the first entry
        /// of <c>Language.All</c> whose name the file name ends with; otherwise
        /// <c>Language.Unknown</c>. Exceptions are logged at debug level and also yield
        /// <c>Language.Unknown</c>.
        /// </returns>
        public static Language ParseSubtitleLanguage(string fileName)
        {
            try
            {
                Logger.Debug("Parsing language from subtitle file: {0}", fileName);

                var simpleFilename = Path.GetFileNameWithoutExtension(fileName);
                var languageMatch = SubtitleLanguageRegex.Match(simpleFilename);

                if (languageMatch.Success)
                {
                    var isoCode = languageMatch.Groups["iso_code"].Value;

                    // Invariant casing so the lookup key does not depend on the current culture.
                    var isoLanguage = IsoLanguages.Find(isoCode.ToLowerInvariant());

                    return isoLanguage?.Language ?? Language.Unknown;
                }

                // No ISO code suffix: fall back to a spelled-out language name at the end.
                foreach (Language language in Language.All)
                {
                    if (simpleFilename.EndsWith(language.ToString(), StringComparison.OrdinalIgnoreCase))
                    {
                        return language;
                    }
                }

                Logger.Debug("Unable to parse language from subtitle file: {0}", fileName);
            }
            catch (Exception ex)
            {
                Logger.Debug(ex, "Failed parsing language from subtitle file: {0}", fileName);
            }

            return Language.Unknown;
        }

        /// <summary>
        /// Extracts the subtitle tags (full, forced, foreign, default, cc, psdh, sdh) that
        /// follow the language code in a subtitle file name.
        /// </summary>
        /// <param name="fileName">Subtitle file name or path; the extension is ignored.</param>
        /// <returns>
        /// The lower-cased tags in the order they appear; an empty sequence when there are
        /// none or when parsing fails (the failure is logged at debug level).
        /// </returns>
        public static IEnumerable<string> ParseLanguageTags(string fileName)
        {
            try
            {
                var simpleFilename = Path.GetFileNameWithoutExtension(fileName);
                var match = SubtitleLanguageRegex.Match(simpleFilename);

                // Materialise inside the try block: a deferred query would run in the caller,
                // outside the reach of the catch below.
                return match.Groups["tags"].Captures.Cast<Capture>()
                            .Where(tag => !tag.Value.Empty())
                            .Select(tag => tag.Value.ToLowerInvariant())
                            .ToList();
            }
            catch (Exception ex)
            {
                Logger.Debug(ex, "Failed parsing language tags from subtitle file: {0}", fileName);
            }

            return Enumerable.Empty<string>();
        }

        /// <summary>
        /// Looks for short language markers in a title: first the case-sensitive two-letter
        /// codes, then the case-insensitive markers.
        /// </summary>
        /// <param name="title">Title in its original casing.</param>
        /// <returns>The detected language, or <c>Language.Unknown</c> when nothing matches.</returns>
        private static Language RegexLanguage(string title)
        {
            // Case sensitive
            var caseSensitiveMatch = CaseSensitiveLanguageRegex.Match(title);

            if (caseSensitiveMatch.Groups["lithuanian"].Success)
            {
                return Language.Lithuanian;
            }

            if (caseSensitiveMatch.Groups["czech"].Success)
            {
                return Language.Czech;
            }

            if (caseSensitiveMatch.Groups["polish"].Success)
            {
                return Language.Polish;
            }

            if (caseSensitiveMatch.Groups["bulgarian"].Success)
            {
                return Language.Bulgarian;
            }

            // Case insensitive
            var match = LanguageRegex.Match(title);

            if (match.Groups["italian"].Success)
            {
                return Language.Italian;
            }

            if (match.Groups["german"].Success)
            {
                return Language.German;
            }

            if (match.Groups["flemish"].Success)
            {
                return Language.Flemish;
            }

            if (match.Groups["greek"].Success)
            {
                return Language.Greek;
            }

            if (match.Groups["french"].Success)
            {
                return Language.French;
            }

            if (match.Groups["russian"].Success)
            {
                return Language.Russian;
            }

            // NOTE: LanguageRegex defines no "dutch" group, so this lookup yields an
            // unsuccessful group and this branch is never taken.
            if (match.Groups["dutch"].Success)
            {
                return Language.Dutch;
            }

            if (match.Groups["hungarian"].Success)
            {
                return Language.Hungarian;
            }

            if (match.Groups["hebrew"].Success)
            {
                return Language.Hebrew;
            }

            if (match.Groups["polish"].Success)
            {
                return Language.Polish;
            }

            if (match.Groups["chinese"].Success)
            {
                return Language.Chinese;
            }

            if (match.Groups["bulgarian"].Success)
            {
                return Language.Bulgarian;
            }

            if (match.Groups["ukrainian"].Success)
            {
                return Language.Ukrainian;
            }

            if (match.Groups["spanish"].Success)
            {
                return Language.Spanish;
            }

            return Language.Unknown;
        }
    }
}