Simplified normalizing text

This commit is contained in:
M66B 2021-04-29 19:11:11 +02:00
parent 4a216a719a
commit 3ba8e97d56
1 changed file with 16 additions and 40 deletions

View File

@ -134,8 +134,6 @@ public class HtmlHelper {
private static final int SMALL_IMAGE_SIZE = 5; // pixels
private static final int TRACKING_PIXEL_SURFACE = 25; // pixels
private static final float[] HEADING_SIZES = {1.5f, 1.4f, 1.3f, 1.2f, 1.1f, 1f};
private static String WHITESPACE = " \t\f";
private static String WHITESPACE_NL = WHITESPACE + "\r\n";
private static final String LINE = "----------------------------------------";
private static final HashMap<String, Integer> x11ColorMap = new HashMap<>();
@ -2038,8 +2036,7 @@ public class HtmlHelper {
private int plain = 0;
private List<TextNode> block = new ArrayList<>();
private Pattern TRIM_WHITESPACE_NL =
Pattern.compile("[" + WHITESPACE + "]*\\r?\\n[" + WHITESPACE + "]*");
private final Pattern FOLD_WHITESPACE = Pattern.compile("[ \t\f\r\n]+");
// https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
@ -2079,37 +2076,27 @@ public class HtmlHelper {
// https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace
TextNode tnode;
String text;
int index;
for (int i = 0; i < block.size(); ) {
tnode = block.get(i);
text = tnode.getWholeText();
// Remove whitespace before/after newlines
text = TRIM_WHITESPACE_NL.matcher(text).replaceAll(" ");
if ("-- ".equals(text)) {
tnode.text(text);
i++;
continue;
}
// Remove leading whitespace
if (i == 0 || endsWithWhitespace(block.get(i - 1).text())) {
index = 0;
while (isWhiteSpace(text, index))
index++;
// Fold white space
text = FOLD_WHITESPACE.matcher(text).replaceAll(" ");
if (index > 0)
text = text.substring(index);
}
// Conditionally remove leading whitespace
if (isSpace(text, 0) &&
(i == 0 || endsWithSpace(block.get(i - 1).text())))
text = text.substring(1);
// Remove multiple trailing whitespace
index = text.length() - 1;
while (isWhiteSpace(text, index) &&
(isWhiteSpace(text, index - 1) || i == block.size() - 1))
index--;
text = text.substring(0, index + 1);
// Conditionally remove trailing whitespace
if (i == block.size() - 1 && endsWithSpace(text))
text = text.substring(0, text.length() - 1);
tnode.text(text);
@ -2119,23 +2106,13 @@ public class HtmlHelper {
i++;
}
// Remove last trailing whitespace
if (block.size() > 0) {
tnode = block.get(block.size() - 1);
text = tnode.getWholeText();
if (!"-- ".equals(text) && endsWithWhitespace(text)) {
text = text.substring(0, text.length() - 1);
tnode.text(text);
}
}
// Remove blank blocks
// Remove all blank blocks
// Scan every character of every text node in the block; 'blank' starts out
// true and is cleared by the first character that satisfies the test below.
boolean blank = true;
for (int i = 0; i < block.size(); i++) {
text = block.get(i).getWholeText();
for (int j = 0; j < text.length(); j++) {
char kar = text.charAt(j);
// NOTE(review): the two conditions listed next have opposite polarity.
// The first clears 'blank' on a character that is NOT in WHITESPACE; the
// second clears it on a character that IS a space. This listing appears to
// interleave removed and added diff lines (markers stripped), so presumably
// only one of them is live. If the second is the added line, it looks
// inverted (kar != ' ' would match the first one's intent) - TODO confirm.
if (WHITESPACE.indexOf(kar) < 0) {
if (kar == ' ') {
blank = false;
break;
}
@ -2155,15 +2132,14 @@ public class HtmlHelper {
}
}
// Bounds-safe test of the single character at 'index': any index outside
// [0, text.length()) yields false instead of throwing.
// NOTE(review): two signatures and two return statements are listed here;
// presumably these are the removed (isWhiteSpace) and added (isSpace) lines of
// a diff whose +/- markers were stripped - confirm against the repository.
boolean isWhiteSpace(String text, int index) {
boolean isSpace(String text, int index) {
if (index < 0 || index >= text.length())
return false;
char kar = text.charAt(index);
// isWhiteSpace variant: true when the character occurs in WHITESPACE_NL
return (WHITESPACE_NL.indexOf(kar) >= 0);
// isSpace variant: true only for a plain space character
return (text.charAt(index) == ' ');
}
// Tests the last character of 'text' by delegating to the index-based helper.
// For an empty string the index passed is -1, which that helper rejects, so
// the result is false.
// NOTE(review): both the endsWithWhitespace and endsWithSpace header/body
// pairs are listed; presumably old and new diff lines shown together.
boolean endsWithWhitespace(String text) {
return isWhiteSpace(text, text.length() - 1);
boolean endsWithSpace(String text) {
return isSpace(text, text.length() - 1);
}
}, document.body());