Procházet zdrojové kódy

no more hard-coding UTF-8, detect the charset from the response and use that

tags/v1.2.4
Jonathan Cobb před 4 roky
rodič
revize
fe2332b427
21 změnil soubory, kde provedl 8118 přidání a 38 odebrání
  1. +13
    -10
      bubble-server/src/main/java/bubble/rule/AbstractAppRuleDriver.java
  2. +5
    -4
      bubble-server/src/main/java/bubble/rule/AppRuleDriver.java
  3. +5
    -4
      bubble-server/src/main/java/bubble/rule/bblock/BubbleBlockRuleDriver.java
  4. +3
    -2
      bubble-server/src/main/java/bubble/rule/social/block/JsUserBlockerRuleDriver.java
  5. +4
    -4
      bubble-server/src/main/java/bubble/rule/social/block/UserBlockerRuleDriver.java
  6. +37
    -9
      bubble-server/src/main/java/bubble/service/stream/ActiveStreamState.java
  7. +16
    -2
      bubble-server/src/main/java/bubble/service/stream/StandardRuleEngineService.java
  8. +11
    -0
      bubble-server/src/main/java/bubble/service/stream/StreamConstants.java
  9. +27
    -0
      bubble-server/src/main/java/bubble/service/stream/charset/BubbleCharSet.java
  10. +23
    -0
      bubble-server/src/main/java/bubble/service/stream/charset/CharsetDetector.java
  11. +43
    -0
      bubble-server/src/main/java/bubble/service/stream/charset/HtmlCharsetDetector.java
  12. +92
    -0
      bubble-server/src/main/java/bubble/service/stream/charset/HtmlStreamCharsetDetector.java
  13. +138
    -0
      bubble-server/src/test/java/bubble/test/filter/CharsetDetectionTest.java
  14. +21
    -0
      bubble-server/src/test/java/bubble/test/filter/PassthruDriver.java
  15. +2552
    -0
      bubble-server/src/test/resources/charset_detection/equiv-windows-1250.html
  16. +2569
    -0
      bubble-server/src/test/resources/charset_detection/meta-windows-1250-late.html
  17. +2554
    -0
      bubble-server/src/test/resources/charset_detection/meta-windows-1250.html
  18. +2
    -0
      pom.xml
  19. +1
    -1
      utils/cobbzilla-parent
  20. +1
    -1
      utils/cobbzilla-utils
  21. +1
    -1
      utils/cobbzilla-wizard

+ 13
- 10
bubble-server/src/main/java/bubble/rule/AbstractAppRuleDriver.java Zobrazit soubor

@@ -36,6 +36,7 @@ import org.springframework.beans.factory.annotation.Autowired;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -51,7 +52,6 @@ import static org.cobbzilla.util.io.FileUtil.basename;
import static org.cobbzilla.util.io.regex.RegexReplacementFilter.DEFAULT_PREFIX_REPLACEMENT_WITH_MATCH;
import static org.cobbzilla.util.json.JsonUtil.json;
import static org.cobbzilla.util.security.ShaUtil.sha256_hex;
import static org.cobbzilla.util.string.StringUtil.UTF8cs;

public abstract class AbstractAppRuleDriver implements AppRuleDriver {

@@ -105,19 +105,21 @@ public abstract class AbstractAppRuleDriver implements AppRuleDriver {
}

public static final String DEFAULT_INSERTION_REGEX = "<\\s*head[^>]*>";
public static final String DEFAULT_SCRIPT_OPEN = "<meta charset=\"UTF-8\"><script>";
public static final String CHARSET_VAR = "{{charset}}";
public static final String DEFAULT_SCRIPT_OPEN = "<meta charset=\""+CHARSET_VAR+"\"><script>";
public static final String NONCE_VAR = "{{nonce}}";
public static final String DEFAULT_SCRIPT_NONCE_OPEN = "<meta charset=\"UTF-8\"><script nonce=\""+NONCE_VAR+"\">";
public static final String DEFAULT_SCRIPT_NONCE_OPEN = "<meta charset=\""+CHARSET_VAR+"\"><script nonce=\""+NONCE_VAR+"\">";
public static final String DEFAULT_SCRIPT_CLOSE = "</script>";

protected static String insertionRegex (String customRegex) {
return empty(customRegex) ? DEFAULT_INSERTION_REGEX : customRegex;
}

protected static String scriptOpen (FilterHttpRequest filterRequest, String customNonceOpen, String customNoNonceOpen) {
return filterRequest.hasScriptNonce()
protected static String scriptOpen (FilterHttpRequest filterRequest, String charset, String customNonceOpen, String customNoNonceOpen) {
return (filterRequest.hasScriptNonce()
? (empty(customNonceOpen) ? DEFAULT_SCRIPT_NONCE_OPEN : customNonceOpen).replace(NONCE_VAR, filterRequest.getScriptNonce())
: (empty(customNoNonceOpen) ? DEFAULT_SCRIPT_OPEN : customNoNonceOpen);
: (empty(customNoNonceOpen) ? DEFAULT_SCRIPT_OPEN : customNoNonceOpen)
).replace(CHARSET_VAR, charset);
}

protected static String scriptClose (String customClose) {
@@ -153,6 +155,7 @@ public abstract class AbstractAppRuleDriver implements AppRuleDriver {
@Getter(lazy=true) private final String scriptClose = scriptClose(requestModConfig().getScriptClose());

protected InputStream filterInsertJs(InputStream in,
Charset charset,
FilterHttpRequest filterRequest,
Map<String, Object> filterCtx,
String bubbleJsTemplate,
@@ -161,7 +164,7 @@ public abstract class AbstractAppRuleDriver implements AppRuleDriver {
boolean showIcon) {
final RequestModifierConfig modConfig = requestModConfig();
final String replacement = DEFAULT_PREFIX_REPLACEMENT_WITH_MATCH
+ scriptOpen(filterRequest, modConfig.getScriptOpenNonce(), modConfig.getScriptOpenNoNonce())
+ scriptOpen(filterRequest, charset.name(), modConfig.getScriptOpenNonce(), modConfig.getScriptOpenNoNonce())
+ getBubbleJs(filterRequest, filterCtx, bubbleJsTemplate, defaultSiteTemplate, siteJsInsertionVar, showIcon)
+ getScriptClose();

@@ -187,7 +190,7 @@ public abstract class AbstractAppRuleDriver implements AppRuleDriver {
if (alternates != null) {
final BubbleAlternateRegexReplacement firstAlt = alternates.get(0);
if (log.isInfoEnabled()) log.info(prefix + "using alternate filter (0): " +firstAlt);
reader = new RegexFilterReader(new InputStreamReader(in), firstAlt.regexFilter(filterRequest, replacement))
reader = new RegexFilterReader(new InputStreamReader(in, charset), firstAlt.regexFilter(filterRequest, replacement))
.setName(filterNamePrefix + "(alt0: "+firstAlt.getFqdnMatch()+") " + firstAlt.getInsertionRegex())
.setMaxMatches(1);
for (int i=1; i<alternates.size(); i++) {
@@ -200,7 +203,7 @@ public abstract class AbstractAppRuleDriver implements AppRuleDriver {

} else {
if (log.isInfoEnabled()) log.info(prefix + "using default filter: " +getInsertionRegex());
reader = new RegexFilterReader(new InputStreamReader(in), new RegexReplacementFilter(getInsertionRegex(), replacement))
reader = new RegexFilterReader(new InputStreamReader(in, charset), new RegexReplacementFilter(getInsertionRegex(), replacement))
.setName(filterNamePrefix + getInsertionRegex())
.setMaxMatches(1);
}
@@ -213,7 +216,7 @@ public abstract class AbstractAppRuleDriver implements AppRuleDriver {
}
}

return new ReaderInputStream(reader, UTF8cs);
return new ReaderInputStream(reader, charset);
}

protected String getBubbleJs(FilterHttpRequest filterRequest,


+ 5
- 4
bubble-server/src/main/java/bubble/rule/AppRuleDriver.java Zobrazit soubor

@@ -24,6 +24,7 @@ import org.slf4j.LoggerFactory;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.Set;

@@ -155,12 +156,12 @@ public interface AppRuleDriver {

default InputStream doFilterRequest(InputStream in) { return in; }

default InputStream filterResponse(FilterHttpRequest filterRequest, InputStream in) {
if (hasNext()) return doFilterResponse(filterRequest, getNext().filterResponse(filterRequest, in));
return doFilterResponse(filterRequest, in);
default InputStream filterResponse(FilterHttpRequest filterRequest, InputStream in, Charset charset) {
if (hasNext()) return doFilterResponse(filterRequest, getNext().filterResponse(filterRequest, in, charset), charset);
return doFilterResponse(filterRequest, in, charset);
}

default InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in) { return in; }
default InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in, Charset charset) { return in; }

default String resolveResource(String res, Map<String, Object> ctx) {
final String resource = locateResource(res);


+ 5
- 4
bubble-server/src/main/java/bubble/rule/bblock/BubbleBlockRuleDriver.java Zobrazit soubor

@@ -30,6 +30,7 @@ import org.glassfish.jersey.server.ContainerRequest;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.nio.charset.Charset;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicReference;
@@ -331,7 +332,7 @@ public class BubbleBlockRuleDriver extends TrafficAnalyticsRuleDriver
public static final String FILTER_CTX_DECISION = "decision";
public static final String BLOCK_STATS_JS = "BLOCK_STATS_JS";

@Override public InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in) {
@Override public InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in, Charset charset) {

final FilterMatchersRequest request = filterRequest.getMatchersResponse().getRequest();
final String prefix = "doFilterResponse("+filterRequest.getId()+"): ";
@@ -384,14 +385,14 @@ public class BubbleBlockRuleDriver extends TrafficAnalyticsRuleDriver
}
if (bubbleBlockConfig.inPageBlocks() && showStats) {
if (log.isInfoEnabled()) log.info(prefix + "SEND: both inPageBlocks and showStats are true, filtering");
return filterInsertJs(in, filterRequest, filterCtx, BUBBLE_JS_TEMPLATE, getBubbleJsStatsTemplate(), BLOCK_STATS_JS, showStats);
return filterInsertJs(in, charset, filterRequest, filterCtx, BUBBLE_JS_TEMPLATE, getBubbleJsStatsTemplate(), BLOCK_STATS_JS, showStats);
}
if (bubbleBlockConfig.inPageBlocks()) {
if (log.isInfoEnabled()) log.info(prefix + "SEND: both inPageBlocks is true, filtering");
return filterInsertJs(in, filterRequest, filterCtx, BUBBLE_JS_TEMPLATE, EMPTY, BLOCK_STATS_JS, showStats);
return filterInsertJs(in, charset, filterRequest, filterCtx, BUBBLE_JS_TEMPLATE, EMPTY, BLOCK_STATS_JS, showStats);
}
if (log.isInfoEnabled()) log.info(prefix+"inserting JS for stats into: "+request.getUrl()+" with Content-Type: "+filterRequest.getContentType());
return filterInsertJs(in, filterRequest, filterCtx, getBubbleJsStatsTemplate(), null, null, showStats);
return filterInsertJs(in, charset, filterRequest, filterCtx, getBubbleJsStatsTemplate(), null, null, showStats);
}

protected String getBubbleJsStatsTemplate () {


+ 3
- 2
bubble-server/src/main/java/bubble/rule/social/block/JsUserBlockerRuleDriver.java Zobrazit soubor

@@ -12,6 +12,7 @@ import lombok.Getter;
import lombok.extern.slf4j.Slf4j;

import java.io.InputStream;
import java.nio.charset.Charset;

import static org.cobbzilla.util.io.FileUtil.basename;
import static org.cobbzilla.util.io.StreamUtil.stream2string;
@@ -38,10 +39,10 @@ public class JsUserBlockerRuleDriver extends AbstractAppRuleDriver implements Re
return loadTemplate(getDefaultSiteJsTemplate(), basename(getRequestModifierConfig().getSiteJsTemplate()));
}

@Override public InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in) {
@Override public InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in, Charset charset) {
if (!filterRequest.isHtml()) return in;
final String bubbleJsTemplate = loadTemplate(BUBBLE_JS_TEMPLATE, BUBBLE_JS_TEMPLATE_NAME);
final String siteJsTemplate = getSiteJsTemplate();
return filterInsertJs(in, filterRequest, null, bubbleJsTemplate, siteJsTemplate, CTX_APPLY_BLOCKS_JS, true);
return filterInsertJs(in, charset, filterRequest, null, bubbleJsTemplate, siteJsTemplate, CTX_APPLY_BLOCKS_JS, true);
}
}

+ 4
- 4
bubble-server/src/main/java/bubble/rule/social/block/UserBlockerRuleDriver.java Zobrazit soubor

@@ -18,13 +18,13 @@ import org.cobbzilla.util.io.regex.RegexInsertionFilter;
import org.cobbzilla.util.io.regex.RegexStreamFilter;

import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import static org.cobbzilla.util.daemon.ZillaRuntime.die;
import static org.cobbzilla.util.json.JsonUtil.json;
import static org.cobbzilla.util.string.StringUtil.UTF8cs;

@Slf4j
public class UserBlockerRuleDriver extends AbstractAppRuleDriver {
@@ -60,14 +60,14 @@ public class UserBlockerRuleDriver extends AbstractAppRuleDriver {

protected UserBlockerConfig configObject() { return json(getFullConfig(), UserBlockerConfig.class); }

@Override public InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in) {
@Override public InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in, Charset charset) {
if (!filterRequest.isHtml()) return in;

final String requestId = filterRequest.getId();
final UserBlockerStreamFilter filter = new UserBlockerStreamFilter(requestId, matcher, rule, configuration.getHttp().getBaseUri());
filter.configure(getFullConfig());
filter.setDataDAO(appDataDAO);
RegexFilterReader reader = new RegexFilterReader(in, RESPONSE_BUFSIZ, filter).setName("mainFilterReader");
RegexFilterReader reader = new RegexFilterReader(in, charset, RESPONSE_BUFSIZ, filter).setName("mainFilterReader");

final UserBlockerConfig config = configObject();
if (config.hasCommentDecorator()) {
@@ -110,7 +110,7 @@ public class UserBlockerRuleDriver extends AbstractAppRuleDriver {
}
}

return new ReaderInputStream(reader, UTF8cs);
return new ReaderInputStream(reader, charset);
}

protected String startElementRegex(String el) { return "(<\\s*" + el + "[^>]*>)"; }


+ 37
- 9
bubble-server/src/main/java/bubble/service/stream/ActiveStreamState.java Zobrazit soubor

@@ -5,6 +5,9 @@
package bubble.service.stream;

import bubble.resources.stream.FilterHttpRequest;
import bubble.service.stream.charset.BubbleCharSet;
import bubble.service.stream.charset.CharsetDetector;
import lombok.Cleanup;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
@@ -21,9 +24,11 @@ import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Map;

import static bubble.service.stream.charset.CharsetDetector.charSetDetectorForContentType;
import static java.util.concurrent.TimeUnit.DAYS;
import static java.util.concurrent.TimeUnit.SECONDS;
import static org.apache.commons.lang3.ArrayUtils.EMPTY_BYTE_ARRAY;
@@ -32,15 +37,11 @@ import static org.cobbzilla.util.daemon.ZillaRuntime.shortError;
import static org.cobbzilla.util.io.NullInputStream.NULL_STREAM;

@Slf4j
class ActiveStreamState {
public class ActiveStreamState {

public static final int DEFAULT_BYTE_BUFFER_SIZE = (int) (8 * Bytes.KB);
public static final long MAX_BYTE_BUFFER_SIZE = (64 * Bytes.KB);

// do not wrap input with encoding stream until we have received at least this many bytes
// this avoids errors when creating a GZIPInputStream when only one or a few bytes are available
public static final long MIN_BYTES_BEFORE_WRAP = Bytes.KB;

// If no data is readable for this long, shut down the underlying MultiStream
public static final long UNDERFLOW_TIMEOUT = SECONDS.toMillis(60);

@@ -69,6 +70,7 @@ class ActiveStreamState {
private InputStream output = null;
private long totalBytesWritten = 0;
private long totalBytesRead = 0;
private CharsetDetector charsetDetector;

public ActiveStreamState(FilterHttpRequest request,
List<AppRuleHarness> rules) {
@@ -76,6 +78,7 @@ class ActiveStreamState {
this.requestId = request.getId();
this.encoding = request.getEncoding();
this.firstRule = rules.get(0);
this.charsetDetector = charSetDetectorForContentType(request.getContentType());

final String prefix = "ActiveStreamState("+reqId()+"): ";
if (empty(rules)) {
@@ -130,8 +133,13 @@ class ActiveStreamState {
}
// do not wrap input with encoding stream until we have received at least MIN_BYTES_BEFORE_WRAP bytes
// this avoids errors when creating a GZIPInputStream when only one or a few bytes are available
if (output == null && totalBytesWritten > MIN_BYTES_BEFORE_WRAP) {
output = outputStream(firstRule.getDriver().filterResponse(request, inputStream(multiStream)));
if (output == null && totalBytesWritten > StreamConstants.MIN_BYTES_BEFORE_WRAP) {
log.info("addChunk: detecting charset using "+charsetDetector.getClass().getSimpleName());
final BubbleCharSet cs = getCharSet(false);
log.info("addChunk: detected charset: "+cs);
if (cs != null) {
output = outputStream(firstRule.getDriver().filterResponse(request, inputStream(multiStream), cs.getCharset()));
}
}
}
}
@@ -147,7 +155,27 @@ class ActiveStreamState {
multiStream.addLastStream(chunkStream);
}
if (output == null) {
output = outputStream(firstRule.getDriver().filterResponse(request, inputStream(multiStream)));
log.info("addLastChunk: detecting charset using "+charsetDetector.getClass().getSimpleName());
final BubbleCharSet cs = getCharSet(true);
log.info("addLastChunk: detected charset: "+cs);
final Charset charset;
if (cs == null) {
log.warn(prefix("addLastChunk")+"no charset could be determined");
charset = null;
} else {
charset = cs.getCharset();
}
output = outputStream(firstRule.getDriver().filterResponse(request, inputStream(multiStream), charset));
}
}

public BubbleCharSet getCharSet(boolean last) throws IOException {
try {
multiStream.mark((int) totalBytesWritten);
@Cleanup final InputStream in = inputStream(multiStream);
return charsetDetector.getCharSet(in, totalBytesWritten, last);
} finally {
multiStream.reset();
}
}

@@ -211,7 +239,7 @@ class ActiveStreamState {
if (log.isDebugEnabled()) log.debug(prefix+"identity encoding, returning baseStream unmodified");
return baseStream;

} else if (doNotWrap.containsKey(url)) {
} else if (url == null || doNotWrap.containsKey(url)) {
if (log.isDebugEnabled()) log.debug(prefix+"previous error wrapping encoding, returning baseStream unmodified");
encoding = null;
return baseStream;


+ 16
- 2
bubble-server/src/main/java/bubble/service/stream/StandardRuleEngineService.java Zobrazit soubor

@@ -18,6 +18,8 @@ import bubble.resources.stream.FilterMatchersRequest;
import bubble.rule.AppRuleDriver;
import bubble.rule.FilterMatchDecision;
import bubble.server.BubbleConfiguration;
import bubble.service.stream.charset.BubbleCharSet;
import bubble.service.stream.charset.CharsetDetector;
import lombok.Cleanup;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
@@ -51,6 +53,7 @@ import javax.ws.rs.core.Response;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
@@ -64,7 +67,8 @@ import static java.util.concurrent.TimeUnit.MINUTES;
import static javax.ws.rs.core.HttpHeaders.CONTENT_LENGTH;
import static org.apache.http.HttpHeaders.CONTENT_TYPE;
import static org.apache.http.HttpHeaders.TRANSFER_ENCODING;
import static org.cobbzilla.util.daemon.ZillaRuntime.*;
import static org.cobbzilla.util.daemon.ZillaRuntime.empty;
import static org.cobbzilla.util.daemon.ZillaRuntime.hashOf;
import static org.cobbzilla.util.http.HttpStatusCodes.OK;
import static org.cobbzilla.util.json.JsonUtil.COMPACT_MAPPER;
import static org.cobbzilla.util.json.JsonUtil.json;
@@ -141,7 +145,17 @@ public class StandardRuleEngineService implements RuleEngineService {
// filter response. when stream is closed, close http client
final Header contentTypeHeader = proxyResponse.getFirstHeader(CONTENT_TYPE);
filterRequest.setContentType(contentTypeHeader == null ? null : contentTypeHeader.getValue());
final InputStream responseEntity = firstRule.getDriver().filterResponse(filterRequest, new HttpClosingFilterInputStream(httpClient, proxyResponse));
final InputStream in = new HttpClosingFilterInputStream(httpClient, proxyResponse);

// do we have a content length?
final Header contentLengthHeader = proxyResponse.getFirstHeader(CONTENT_LENGTH);
final Long contentLength = contentLengthHeader == null ? null : Long.parseLong(contentLengthHeader.getValue());
filterRequest.setContentLength(contentLength);

final CharsetDetector charsetDetector = CharsetDetector.charSetDetectorForContentType(filterRequest.getContentType());
final BubbleCharSet cs = charsetDetector.getCharSet(in, contentLength != null ? contentLength : 1024, true);
final Charset charset = cs == null ? null : cs.getCharset();
final InputStream responseEntity = firstRule.getDriver().filterResponse(filterRequest, in, charset);

// send response
return sendResponse(responseEntity, proxyResponse);


+ 11
- 0
bubble-server/src/main/java/bubble/service/stream/StreamConstants.java Zobrazit soubor

@@ -0,0 +1,11 @@
package bubble.service.stream;

import org.cobbzilla.util.system.Bytes;

/**
 * Constants shared by the stream-filtering classes.
 */
public class StreamConstants {

/**
 * Minimum number of bytes that must have been received before the input is
 * wrapped with an encoding (decoding) stream.
 *
 * Waiting for this many bytes avoids errors when creating a GZIPInputStream
 * while only one or a few bytes are available.
 */
public static final long MIN_BYTES_BEFORE_WRAP = Bytes.KB;

}

+ 27
- 0
bubble-server/src/main/java/bubble/service/stream/charset/BubbleCharSet.java Zobrazit soubor

@@ -0,0 +1,27 @@
package bubble.service.stream.charset;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;

import java.nio.charset.Charset;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import static org.cobbzilla.util.string.StringUtil.UTF8cs;

/**
 * Wrapper around a {@link Charset} that can also represent "no charset" via {@link #RAW}.
 *
 * Instances for real charsets are interned in a cache, so repeated lookups of the same
 * charset return the same wrapper.
 */
@AllArgsConstructor @ToString(of="charset")
public class BubbleCharSet {

    /** Interned wrappers, one per charset. */
    private static final Map<Charset, BubbleCharSet> cache = new ConcurrentHashMap<>(10);

    /** Represents the absence of a charset; its {@link #getCharset()} is null. */
    public static final BubbleCharSet RAW = new BubbleCharSet(null);

    /** Shared wrapper for UTF-8. */
    public static final BubbleCharSet UTF8 = forCharSet(UTF8cs);

    /**
     * Returns the shared wrapper for the given charset.
     *
     * @param cs the charset to wrap; may be null
     * @return the cached wrapper for {@code cs}, or {@link #RAW} when {@code cs} is null
     */
    public static BubbleCharSet forCharSet(Charset cs) {
        // ConcurrentHashMap rejects null keys, so map "no charset" onto the RAW instance
        // instead of letting computeIfAbsent throw a NullPointerException.
        if (cs == null) return RAW;
        return cache.computeIfAbsent(cs, BubbleCharSet::new);
    }

    /** The wrapped charset, or null for {@link #RAW}. */
    @Getter private final Charset charset;

}

+ 23
- 0
bubble-server/src/main/java/bubble/service/stream/charset/CharsetDetector.java Zobrazit soubor

@@ -0,0 +1,23 @@
package bubble.service.stream.charset;

import java.io.InputStream;

import static bubble.service.stream.charset.HtmlCharsetDetector.htmlCharSetDetector;
import static org.cobbzilla.util.http.HttpContentTypes.isHtml;

/**
 * Strategy for determining the character set of a response body.
 */
public interface CharsetDetector {

    /** Detector that performs no inspection and always reports {@link BubbleCharSet#RAW}. */
    CharsetDetector SKIP_CHARSET_DETECTION = new SkipCharsetDetection();

    /**
     * Chooses a detector based on the response content type.
     *
     * @param contentType the Content-Type value of the response
     * @return an HTML detector when the content type is HTML, otherwise {@link #SKIP_CHARSET_DETECTION}
     */
    static CharsetDetector charSetDetectorForContentType(String contentType) {
        return isHtml(contentType)
                ? htmlCharSetDetector(contentType)
                : SKIP_CHARSET_DETECTION;
    }

    /**
     * Determines the charset of the data readable from the given stream.
     *
     * @param in   stream to inspect
     * @param size number of bytes the caller considers available for inspection
     * @param last true when no further data will follow
     * @return the detected charset wrapper
     */
    BubbleCharSet getCharSet(InputStream in, long size, boolean last);

    /** No-op detector: ignores its arguments and returns {@link BubbleCharSet#RAW}. */
    class SkipCharsetDetection implements CharsetDetector {
        @Override
        public BubbleCharSet getCharSet(InputStream in, long size, boolean last) {
            return BubbleCharSet.RAW;
        }
    }

}

+ 43
- 0
bubble-server/src/main/java/bubble/service/stream/charset/HtmlCharsetDetector.java Zobrazit soubor

@@ -0,0 +1,43 @@
package bubble.service.stream.charset;

import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;

import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import static bubble.service.stream.charset.HtmlStreamCharsetDetector.HTML_STREAM_CHARSET_DETECTOR;

/**
 * Base class for HTML charset detectors, plus a factory that picks a detector
 * from a Content-Type header value.
 */
@Slf4j
public abstract class HtmlCharsetDetector implements CharsetDetector {

    /** Detectors already resolved, keyed by the exact Content-Type string. */
    private static final Map<String, HtmlCharsetDetector> detectors = new ConcurrentHashMap<>(10);

    public static final String CONTENT_TYPE_CHARSET = "charset=";

    /**
     * Returns a detector for the given HTML Content-Type.
     *
     * If the header carries a valid {@code charset} parameter, the returned detector always
     * reports that charset. Otherwise (no parameter, unknown/illegal charset name, or a null
     * header) the stream-sniffing detector is returned.
     *
     * @param contentType the Content-Type header value; may be null
     * @return a detector, never null
     */
    public static CharsetDetector htmlCharSetDetector(String contentType) {
        // ConcurrentHashMap does not accept null keys
        if (contentType == null) return HTML_STREAM_CHARSET_DETECTOR;
        return detectors.computeIfAbsent(contentType, ct -> {
            final String charsetName = charsetParameter(ct);
            if (charsetName == null) return HTML_STREAM_CHARSET_DETECTOR;
            try {
                return new HtmlContentTypeCharSet(Charset.forName(charsetName));
            } catch (IllegalArgumentException e) {
                // covers IllegalCharsetNameException and UnsupportedCharsetException
                log.error("htmlCharSetDetector: invalid charset, returning HtmlStreamCharsetDetector: "+charsetName);
                return HTML_STREAM_CHARSET_DETECTOR;
            }
        });
    }

    /**
     * Extracts the value of the charset parameter from a Content-Type value.
     *
     * The parameter name is matched case-insensitively; the value ends at the next ';',
     * is trimmed, and has one pair of surrounding quotes removed.
     *
     * @return the charset name, or null if no non-empty charset parameter is present
     */
    private static String charsetParameter(String contentType) {
        final int csPos = indexOfIgnoreCase(contentType, CONTENT_TYPE_CHARSET);
        if (csPos == -1) return null;

        String value = contentType.substring(csPos + CONTENT_TYPE_CHARSET.length());
        final int paramEnd = value.indexOf(';');
        if (paramEnd != -1) value = value.substring(0, paramEnd);
        value = value.trim();

        if (value.length() >= 2) {
            final char first = value.charAt(0);
            final char lastChar = value.charAt(value.length() - 1);
            if ((first == '"' || first == '\'') && first == lastChar) {
                value = value.substring(1, value.length() - 1).trim();
            }
        }
        return value.isEmpty() ? null : value;
    }

    /** Case-insensitive indexOf; returns -1 when {@code needle} does not occur in {@code s}. */
    private static int indexOfIgnoreCase(String s, String needle) {
        final int max = s.length() - needle.length();
        for (int i = 0; i <= max; i++) {
            if (s.regionMatches(true, i, needle, 0, needle.length())) return i;
        }
        return -1;
    }

    /** Detector that always reports the charset declared in the Content-Type header. */
    @AllArgsConstructor
    public static class HtmlContentTypeCharSet extends HtmlCharsetDetector {
        private final Charset charset;
        @Override public BubbleCharSet getCharSet(InputStream in, long size, boolean last) {
            return BubbleCharSet.forCharSet(charset);
        }
    }

}

+ 92
- 0
bubble-server/src/main/java/bubble/service/stream/charset/HtmlStreamCharsetDetector.java Zobrazit soubor

@@ -0,0 +1,92 @@
package bubble.service.stream.charset;

import lombok.extern.slf4j.Slf4j;

import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static bubble.service.stream.StreamConstants.MIN_BYTES_BEFORE_WRAP;
import static org.cobbzilla.util.daemon.ZillaRuntime.shortError;
import static org.cobbzilla.util.string.StringUtil.UTF8cs;

/**
 * Detects the charset of an HTML document by scanning the start of the stream for a
 * {@code <meta charset="...">} or {@code <meta http-equiv="Content-Type" ...>} declaration.
 */
@Slf4j
public class HtmlStreamCharsetDetector extends HtmlCharsetDetector {

    public static final HtmlStreamCharsetDetector HTML_STREAM_CHARSET_DETECTOR = new HtmlStreamCharsetDetector();

    private static final Pattern HTML_CONTENT_TYPE_EQUIV_CHARSET
            = Pattern.compile("<meta\\s+http-equiv\\s*=\\s*\"Content-Type\"\\s+content=\"[/\\w]+\\s*;\\s*charset=([-\\w]+)\\s*\"\\s*>", Pattern.CASE_INSENSITIVE);

    private static final Pattern HTML_META_CHARSET
            = Pattern.compile("<meta\\s+charset\\s*=\\s*\"([-\\w]+)\">", Pattern.CASE_INSENSITIVE);

    private static final Pattern HTML_CLOSE_HEAD
            = Pattern.compile("</head[^>]*>", Pattern.CASE_INSENSITIVE);

    // The tags we look for are pure ASCII. Decoding as ISO-8859-1 maps every byte to exactly
    // one char, so the scan does not depend on the platform default charset and a chunk
    // boundary can never split a multi-byte sequence into garbage.
    private static final Charset SCAN_CHARSET = java.nio.charset.StandardCharsets.ISO_8859_1;

    /**
     * Scans up to {@code size} bytes of {@code in} for a charset declaration.
     *
     * @param in   stream positioned at the start of the document
     * @param size maximum number of bytes to examine
     * @param last true if no more data will arrive after what is currently readable
     * @return the declared charset; UTF-8 if the head element closes first, if the data is
     *         exhausted and {@code last} is true, or if reading fails; null if nothing was
     *         found yet and more data may still arrive
     */
    @Override public BubbleCharSet getCharSet(InputStream in, long size, boolean last) {
        // NOTE(review): callers that pass a decoding stream together with the encoded byte
        // count as size will examine fewer decoded bytes than are available -- confirm.
        final byte[] buffer = new byte[(int) MIN_BYTES_BEFORE_WRAP];
        try {
            final StringBuilder b = new StringBuilder();
            long bytesRead = 0;
            boolean zeroRead = false;
            int count;
            while (bytesRead < size && (count = in.read(buffer, 0, readSize(size, buffer.length, bytesRead))) != -1) {
                if (count == 0) {
                    // reached end of multi-stream, if this is our second zero read, bail out
                    if (zeroRead) {
                        if (last) {
                            if (log.isDebugEnabled()) log.debug("getCharSet: exhausted stream and no match found, returning UTF-8");
                            return BubbleCharSet.UTF8;
                        }
                        if (log.isDebugEnabled()) log.debug("getCharSet: two zero reads, must be at end of multi-stream, returning null");
                        return null;
                    }
                    zeroRead = true;
                    // nothing new to scan
                    continue;
                }
                // track progress so the size bound is actually enforced
                bytesRead += count;
                b.append(new String(buffer, 0, count, SCAN_CHARSET));

                // match against everything read so far, so a tag split across reads is still found
                final String fullData = b.toString();
                final Matcher metaMatcher = HTML_META_CHARSET.matcher(fullData);
                if (metaMatcher.find()) {
                    return BubbleCharSet.forCharSet(safeCharSet(metaMatcher.group(1)));
                }
                final Matcher equivMatcher = HTML_CONTENT_TYPE_EQUIV_CHARSET.matcher(fullData);
                if (equivMatcher.find()) {
                    return BubbleCharSet.forCharSet(safeCharSet(equivMatcher.group(1)));
                }
                final Matcher headCloseMatcher = HTML_CLOSE_HEAD.matcher(fullData);
                if (headCloseMatcher.find()) {
                    if (log.isDebugEnabled()) log.debug("getCharSet: found head closing tag before any charset specifier, returning UTF-8");
                    return BubbleCharSet.UTF8;
                }
            }
            if (last) {
                if (log.isDebugEnabled()) log.debug("getCharSet: exhausted stream and no match found, returning UTF-8");
                return BubbleCharSet.UTF8;
            }
            if (log.isDebugEnabled()) log.debug("getCharSet: exhausted stream and no match found, but more data may be coming, returning null");
            return null;

        } catch (Exception e) {
            log.error("getCharSet: io error, returning UTF-8: "+shortError(e));
            return BubbleCharSet.UTF8;
        }
    }

    /** Resolves a charset by name, falling back to UTF-8 when the name is illegal or unsupported. */
    private Charset safeCharSet(String csName) {
        try {
            return Charset.forName(csName);
        } catch (Exception e) {
            log.error("safeCharSet: invalid name, returning UTF-8: "+csName);
            return UTF8cs;
        }
    }

    /**
     * Number of bytes to request on the next read: a full buffer, or whatever is left of the
     * {@code size} budget if that is smaller. Never negative.
     */
    private static int readSize(long size, int bufsiz, long bytesRead) {
        final long remaining = size - bytesRead;
        return remaining < bufsiz ? (int) Math.max(remaining, 0L) : bufsiz;
    }

}

+ 138
- 0
bubble-server/src/test/java/bubble/test/filter/CharsetDetectionTest.java Zobrazit soubor

@@ -0,0 +1,138 @@
package bubble.test.filter;

import bubble.model.app.AppMatcher;
import bubble.model.app.AppRule;
import bubble.resources.stream.FilterHttpRequest;
import bubble.resources.stream.FilterMatchersRequest;
import bubble.resources.stream.FilterMatchersResponse;
import bubble.service.stream.ActiveStreamState;
import bubble.service.stream.AppRuleHarness;
import org.apache.commons.io.IOUtils;
import org.cobbzilla.util.collection.SingletonList;
import org.cobbzilla.util.http.HttpContentEncodingType;
import org.junit.Test;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

import static org.cobbzilla.util.http.HttpContentEncodingType.*;
import static org.cobbzilla.util.http.HttpContentTypes.TEXT_HTML;
import static org.cobbzilla.util.io.StreamUtil.stream2bytes;
import static org.cobbzilla.util.security.ShaUtil.sha256_base64;
import static org.junit.Assert.*;

/**
 * Feeds windows-1250 encoded HTML fixtures through an ActiveStreamState in chunks, using a
 * PassthruDriver to record the charset handed to the rule driver, and verifies that the
 * charset is reported as windows-1250 and that the filtered output equals the input.
 */
public class CharsetDetectionTest {

// Fixture documents, loaded once as raw bytes
public static final byte[] WIN_1250_TEST = stream2bytes("charset_detection/meta-windows-1250.html");
public static final byte[] WIN_1250_LATE_TEST = stream2bytes("charset_detection/meta-windows-1250-late.html");
public static final byte[] WIN_1250_EQUIV_TEST = stream2bytes("charset_detection/equiv-windows-1250.html");

@Test public void testNonUTF8Charset () throws Exception {
// read first chunk exactly 288 bytes, so it ends in the middle of "charset"
_testNonUTF8Charset(WIN_1250_TEST, 288, null);
}

@Test public void testNonUTF8Charset_gzip () throws Exception {
// for gzip we won't be able to break exactly on charset,
// but try a small read anyway to make sure nothing breaks
_testNonUTF8Charset(gzip.encode(WIN_1250_TEST), 288, gzip);
}

@Test public void testNonUTF8CharsetLate () throws Exception {
_testNonUTF8Charset(WIN_1250_LATE_TEST, 1024, null);
}

@Test public void testNonUTF8CharsetLate_brotli () throws Exception {
_testNonUTF8Charset(br.encode(WIN_1250_LATE_TEST), 1024, br);
}

@Test public void testNonUTF8CharsetEquiv () throws Exception {
_testNonUTF8Charset(WIN_1250_EQUIV_TEST, 1024, null);
}

@Test public void testNonUTF8CharsetEquiv_deflate () throws Exception {
_testNonUTF8Charset(deflate.encode(WIN_1250_EQUIV_TEST), 1024, deflate);
}

/**
 * Shared test body.
 *
 * @param test            the bytes to stream (already encoded when encoding is non-null)
 * @param initialReadSize size of the first chunk; no charset is expected to be seen after it
 * @param encoding        content encoding set on the request, or null for none
 */
private void _testNonUTF8Charset(byte[] test, int initialReadSize, HttpContentEncodingType encoding) throws Exception {
// build a request for text/html content; the URI includes a hash of the test bytes
final FilterHttpRequest request = new FilterHttpRequest()
.setMatchersResponse(new FilterMatchersResponse()
.setMatchers(new SingletonList<>(new AppMatcher()))
.setRequest(new FilterMatchersRequest()
.setFqdn("example.com")
.setUri("/test_"+sha256_base64(test)+".html")))
.setContentType(TEXT_HTML)
.setEncoding(encoding);
// a single rule whose driver only records the charset it is given
final List<AppRuleHarness> rules = new ArrayList<>();
final PassthruDriver driver = new PassthruDriver();
final AppRuleHarness passthruRuleHarness = passthruRuleHarness(driver);
rules.add(passthruRuleHarness);

final ActiveStreamState streamState = new ActiveStreamState(request, rules);

final byte[] buffer = new byte[8192];
final ByteArrayInputStream in = new ByteArrayInputStream(test);
final ByteArrayOutputStream out = new ByteArrayOutputStream(test.length);

// add first chunk, no charset yet found
final byte[] buf = new byte[8192];
final int initialActualRead = in.read(buffer, 0, initialReadSize);
assertEquals("expected first read to read "+initialReadSize+"bytes", initialReadSize, initialActualRead);
System.arraycopy(buffer, 0, buf, 0, initialReadSize);
streamState.addChunk(new ByteArrayInputStream(buf, 0, initialReadSize), initialReadSize);
Charset charset = driver.getLastSeenCharset();
assertNull("expected no charset to be found in the first chunk", charset);

// do not expect to have found a charset yet
InputStream response = streamState.getResponseStream(false);
IOUtils.copyLarge(response, out);

// add remaining chunks, while reading data back
int count;
Charset lastSeenCharset = null;
// NOTE(review): responseCount is incremented but never read
int responseCount = 0;
while ((count = in.read(buffer)) != -1) {
System.arraycopy(buffer, 0, buf, 0, count);
streamState.addChunk(new ByteArrayInputStream(buf, 0, count), count);
response = streamState.getResponseStream(false);
responseCount++;
charset = driver.getLastSeenCharset();
if (charset != null) {
if (lastSeenCharset == null) {
lastSeenCharset = charset;
} else {
// charset cannot change
assertEquals("expected charset to be same as lastSeenCharset", lastSeenCharset, charset);
}
// charset must be windows-1250
assertEquals("expected windows-1250 charset", "windows-1250", charset.name());
}
IOUtils.copyLarge(response, out);
}
// by the time all data has been added, the driver must have been given a charset
assertNotNull("expected to find a charset", lastSeenCharset);
assertEquals("expected windows-1250 charset", "windows-1250", lastSeenCharset.name());

// add last empty chunk
streamState.addLastChunk(new ByteArrayInputStream(new byte[0]), 0);

// read the data back
response = streamState.getResponseStream(true);
IOUtils.copyLarge(response, out);

// compare decoded text: both input and output are decoded first when an encoding was used
final byte[] actualBytes = out.toByteArray();
final String expectedHtml = new String(encoding == null ? test : encoding.decode(test), lastSeenCharset);
final String actualHtml = new String(encoding == null ? actualBytes : encoding.decode(actualBytes), lastSeenCharset);
assertEquals("expected output to be same as input", expectedHtml, actualHtml);
}

/** Creates a rule harness around a fresh matcher and rule, with the given driver installed. */
private AppRuleHarness passthruRuleHarness(PassthruDriver driver) {
final AppRuleHarness appRuleHarness = new AppRuleHarness(new AppMatcher(), new AppRule());
appRuleHarness.setDriver(driver);
return appRuleHarness;
}

}

+ 21
- 0
bubble-server/src/test/java/bubble/test/filter/PassthruDriver.java Zobrazit soubor

@@ -0,0 +1,21 @@
package bubble.test.filter;

import bubble.resources.stream.FilterHttpRequest;
import bubble.rule.AbstractAppRuleDriver;
import lombok.Getter;

import java.io.InputStream;
import java.nio.charset.Charset;

/**
 * Test driver that leaves the response untouched and remembers the charset
 * it was most recently asked to filter with.
 */
public class PassthruDriver extends AbstractAppRuleDriver {

    /** Charset passed to the most recent {@link #doFilterResponse} call; null until then or if none was given. */
    @Getter private Charset lastSeenCharset;

    /** Records the charset, then delegates to the inherited implementation. */
    @Override
    public InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in, Charset charset) {
        lastSeenCharset = charset;
        final InputStream filtered = super.doFilterResponse(filterRequest, in, charset);
        return filtered;
    }

    /** Always reports that this driver could modify the response. */
    @Override
    public boolean couldModify(FilterHttpRequest request) {
        return true;
    }

}

+ 2552
- 0
bubble-server/src/test/resources/charset_detection/equiv-windows-1250.html
Diff nebyl zobrazen, protože je příliš veliký
Zobrazit soubor


+ 2569
- 0
bubble-server/src/test/resources/charset_detection/meta-windows-1250-late.html
Diff nebyl zobrazen, protože je příliš veliký
Zobrazit soubor


+ 2554
- 0
bubble-server/src/test/resources/charset_detection/meta-windows-1250.html
Diff nebyl zobrazen, protože je příliš veliký
Zobrazit soubor


+ 2
- 0
pom.xml Zobrazit soubor

@@ -73,6 +73,7 @@
<include>bubble.test.promo.AccountCreditTest</include>
<include>bubble.test.promo.MultiplePromotionsTest</include>
<include>bubble.test.system.DriverTest</include>
<include>bubble.test.filter.CharsetDetectionTest</include>
<include>bubble.test.filter.ProxyTest</include>
<include>bubble.test.filter.TrafficAnalyticsTest</include>
<include>bubble.test.filter.BlockSummaryTest</include>
@@ -80,6 +81,7 @@
<include>bubble.test.system.BackupTest</include>
<include>bubble.test.system.NetworkTest</include>
<include>bubble.abp.spec.BlockListTest</include>
<include>org.cobbzilla.util.io.regex.RegexFilterReaderTest</include>
</includes>
</configuration>
</plugin>


+ 1
- 1
utils/cobbzilla-parent

@@ -1 +1 @@
Subproject commit bcb92061c3482ba4803c0030c78fb550206a0511
Subproject commit 2324f5d196fa52931f07e85c830bf9c10465b8f8

+ 1
- 1
utils/cobbzilla-utils

@@ -1 +1 @@
Subproject commit 1cfe96e59ca2c98955a379e539183a5596dab33f
Subproject commit a9d3d69f1112a47b8d69f9343cc66caf5bab1383

+ 1
- 1
utils/cobbzilla-wizard

@@ -1 +1 @@
Subproject commit cd9cf18c90a373f024163c048d57cf574ec97293
Subproject commit e91152f296a04ce1e4363ae90e1cfde70cdee4d5

Načítá se…
Zrušit
Uložit