From a28cc1a0f9df62c3f504118229b1218cdf05acaf Mon Sep 17 00:00:00 2001 From: M66B Date: Tue, 12 Mar 2019 09:04:16 +0000 Subject: [PATCH] Improved html to text --- .../eu/faircode/email/AdapterMessage.java | 6 +- .../java/eu/faircode/email/HtmlHelper.java | 92 +++++++++++-------- 2 files changed, 60 insertions(+), 38 deletions(-) diff --git a/app/src/main/java/eu/faircode/email/AdapterMessage.java b/app/src/main/java/eu/faircode/email/AdapterMessage.java index 77e3dad83c..672a47fc62 100644 --- a/app/src/main/java/eu/faircode/email/AdapterMessage.java +++ b/app/src/main/java/eu/faircode/email/AdapterMessage.java @@ -1667,7 +1667,11 @@ public class AdapterMessage extends RecyclerView.Adapter"; + + return HtmlHelper.fromHtml(html, new Html.ImageGetter() { @Override public Drawable getDrawable(String source) { Drawable image = HtmlHelper.decodeImage(source, context, message.id, show_images); diff --git a/app/src/main/java/eu/faircode/email/HtmlHelper.java b/app/src/main/java/eu/faircode/email/HtmlHelper.java index 6f593eef84..8aa3d91390 100644 --- a/app/src/main/java/eu/faircode/email/HtmlHelper.java +++ b/app/src/main/java/eu/faircode/email/HtmlHelper.java @@ -38,6 +38,7 @@ import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; +import org.jsoup.safety.Cleaner; import org.jsoup.safety.Whitelist; import org.jsoup.select.NodeTraversor; import org.jsoup.select.NodeVisitor; @@ -100,12 +101,16 @@ public class HtmlHelper { } static String sanitize(Context context, String html, boolean showQuotes) { - final Document document = Jsoup.parse(Jsoup.clean(html, Whitelist - .relaxed() + Document parsed = Jsoup.parse(html); + Whitelist whitelist = Whitelist.relaxed() .addTags("hr") .removeTags("col", "colgroup", "thead", "tbody") + .removeAttributes("table", "width") + .removeAttributes("td", "colspan", "rowspan", "width") + .removeAttributes("th", "colspan", "rowspan", "width") .addProtocols("img", "src", "cid") - .addProtocols("img", "src", "data"))); + .addProtocols("img", "src", "data"); + final Document document = new Cleaner(whitelist).clean(parsed); // Quotes if (!showQuotes) @@ -115,12 +120,12 @@ public class HtmlHelper { // Tables for (Element col : document.select("th,td")) { // prevent line breaks - col.select("br").tagName("span").html(" "); + col.select("br").tagName("span").html(" "); col.select("div").tagName("span"); // separate columns by a space if (col.nextElementSibling() != null) - col.append(" "); + col.append(" "); if ("th".equals(col.tagName())) col.tagName("strong"); @@ -180,6 +185,7 @@ public class HtmlHelper { for (Element img : document.select("img")) { String src = img.attr("src"); String alt = img.attr("alt"); + String title = img.attr("title"); String height = img.attr("height").trim(); String width = img.attr("width").trim(); @@ -210,6 +216,10 @@ public class HtmlHelper { div.appendElement("br"); div.appendElement("em").text(alt); } + if (!TextUtils.isEmpty(title)) { + div.appendElement("br"); + div.appendElement("em").text(title); + } // Tracking image if ("1".equals(height) && "1".equals(width) && !TextUtils.isEmpty(src)) { @@ -226,51 +236,59 @@ public class HtmlHelper { public void head(Node node, int depth) { if (node instanceof TextNode) { TextNode tnode = (TextNode) node; - Element span = document.createElement("span"); - int pos = 0; String text = tnode.text(); Matcher matcher = PatternsCompat.WEB_URL.matcher(text); - while (matcher.find()) { - boolean linked = false; - Node parent = tnode.parent(); - while (parent != null) { - if ("a".equals(parent.nodeName())) { - linked = true; - break; + if (matcher.matches()) { + Element span = document.createElement("span"); + + int pos = 0; + while (matcher.find()) { + boolean linked = false; + Node parent = tnode.parent(); + while (parent != null) { + if ("a".equals(parent.nodeName())) { + linked = true; + break; + } + parent = parent.parent(); } - parent = parent.parent(); + + String scheme = Uri.parse(matcher.group()).getScheme(); + + if (BuildConfig.DEBUG) + Log.i("Web url=" + matcher.group() + " linked=" + linked + " scheme=" + scheme); + + if (linked || scheme == null) + span.appendText(text.substring(pos, matcher.end())); + else { + span.appendText(text.substring(pos, matcher.start())); + + Element a = document.createElement("a"); + a.attr("href", matcher.group()); + a.text(matcher.group()); + span.appendChild(a); + } + + pos = matcher.end(); } + span.appendText(text.substring(pos)); - String scheme = Uri.parse(matcher.group()).getScheme(); - - if (BuildConfig.DEBUG) - Log.i("Web url=" + matcher.group() + " linked=" + linked + " scheme=" + scheme); - - if (linked || scheme == null) - span.appendText(text.substring(pos, matcher.end())); - else { - span.appendText(text.substring(pos, matcher.start())); - - Element a = document.createElement("a"); - a.attr("href", matcher.group()); - a.text(matcher.group()); - span.appendChild(a); - } - - pos = matcher.end(); + tnode.before(span); + tnode.text(""); } - span.appendText(text.substring(pos)); - - tnode.before(span); - tnode.text(""); } } @Override public void tail(Node node, int depth) { } - }, document.body()); + }, document); + + // Remove block elements displaying nothing + for (Element e : document.select("*")) + if (e.isBlock() && !e.hasText() && e.select("img").size() == 0) + e.remove(); return document.body().html(); }