@@ -36,6 +36,7 @@ import org.springframework.beans.factory.annotation.Autowired; | |||
import java.io.File; | |||
import java.io.InputStream; | |||
import java.io.InputStreamReader; | |||
import java.nio.charset.Charset; | |||
import java.util.ArrayList; | |||
import java.util.HashMap; | |||
import java.util.List; | |||
@@ -51,7 +52,6 @@ import static org.cobbzilla.util.io.FileUtil.basename; | |||
import static org.cobbzilla.util.io.regex.RegexReplacementFilter.DEFAULT_PREFIX_REPLACEMENT_WITH_MATCH; | |||
import static org.cobbzilla.util.json.JsonUtil.json; | |||
import static org.cobbzilla.util.security.ShaUtil.sha256_hex; | |||
import static org.cobbzilla.util.string.StringUtil.UTF8cs; | |||
public abstract class AbstractAppRuleDriver implements AppRuleDriver { | |||
@@ -105,19 +105,21 @@ public abstract class AbstractAppRuleDriver implements AppRuleDriver { | |||
} | |||
public static final String DEFAULT_INSERTION_REGEX = "<\\s*head[^>]*>"; | |||
public static final String DEFAULT_SCRIPT_OPEN = "<meta charset=\"UTF-8\"><script>"; | |||
public static final String CHARSET_VAR = "{{charset}}"; | |||
public static final String DEFAULT_SCRIPT_OPEN = "<meta charset=\""+CHARSET_VAR+"\"><script>"; | |||
public static final String NONCE_VAR = "{{nonce}}"; | |||
public static final String DEFAULT_SCRIPT_NONCE_OPEN = "<meta charset=\"UTF-8\"><script nonce=\""+NONCE_VAR+"\">"; | |||
public static final String DEFAULT_SCRIPT_NONCE_OPEN = "<meta charset=\""+CHARSET_VAR+"\"><script nonce=\""+NONCE_VAR+"\">"; | |||
public static final String DEFAULT_SCRIPT_CLOSE = "</script>"; | |||
// Chooses the regex that locates the script insertion point: | |||
// returns customRegex unless empty(customRegex) is true, in which case DEFAULT_INSERTION_REGEX is returned. | |||
protected static String insertionRegex (String customRegex) { | |||
return empty(customRegex) ? DEFAULT_INSERTION_REGEX : customRegex; | |||
} | |||
protected static String scriptOpen (FilterHttpRequest filterRequest, String customNonceOpen, String customNoNonceOpen) { | |||
return filterRequest.hasScriptNonce() | |||
protected static String scriptOpen (FilterHttpRequest filterRequest, String charset, String customNonceOpen, String customNoNonceOpen) { | |||
return (filterRequest.hasScriptNonce() | |||
? (empty(customNonceOpen) ? DEFAULT_SCRIPT_NONCE_OPEN : customNonceOpen).replace(NONCE_VAR, filterRequest.getScriptNonce()) | |||
: (empty(customNoNonceOpen) ? DEFAULT_SCRIPT_OPEN : customNoNonceOpen); | |||
: (empty(customNoNonceOpen) ? DEFAULT_SCRIPT_OPEN : customNoNonceOpen) | |||
).replace(CHARSET_VAR, charset); | |||
} | |||
protected static String scriptClose (String customClose) { | |||
@@ -153,6 +155,7 @@ public abstract class AbstractAppRuleDriver implements AppRuleDriver { | |||
@Getter(lazy=true) private final String scriptClose = scriptClose(requestModConfig().getScriptClose()); | |||
protected InputStream filterInsertJs(InputStream in, | |||
Charset charset, | |||
FilterHttpRequest filterRequest, | |||
Map<String, Object> filterCtx, | |||
String bubbleJsTemplate, | |||
@@ -161,7 +164,7 @@ public abstract class AbstractAppRuleDriver implements AppRuleDriver { | |||
boolean showIcon) { | |||
final RequestModifierConfig modConfig = requestModConfig(); | |||
final String replacement = DEFAULT_PREFIX_REPLACEMENT_WITH_MATCH | |||
+ scriptOpen(filterRequest, modConfig.getScriptOpenNonce(), modConfig.getScriptOpenNoNonce()) | |||
+ scriptOpen(filterRequest, charset.name(), modConfig.getScriptOpenNonce(), modConfig.getScriptOpenNoNonce()) | |||
+ getBubbleJs(filterRequest, filterCtx, bubbleJsTemplate, defaultSiteTemplate, siteJsInsertionVar, showIcon) | |||
+ getScriptClose(); | |||
@@ -187,7 +190,7 @@ public abstract class AbstractAppRuleDriver implements AppRuleDriver { | |||
if (alternates != null) { | |||
final BubbleAlternateRegexReplacement firstAlt = alternates.get(0); | |||
if (log.isInfoEnabled()) log.info(prefix + "using alternate filter (0): " +firstAlt); | |||
reader = new RegexFilterReader(new InputStreamReader(in), firstAlt.regexFilter(filterRequest, replacement)) | |||
reader = new RegexFilterReader(new InputStreamReader(in, charset), firstAlt.regexFilter(filterRequest, replacement)) | |||
.setName(filterNamePrefix + "(alt0: "+firstAlt.getFqdnMatch()+") " + firstAlt.getInsertionRegex()) | |||
.setMaxMatches(1); | |||
for (int i=1; i<alternates.size(); i++) { | |||
@@ -200,7 +203,7 @@ public abstract class AbstractAppRuleDriver implements AppRuleDriver { | |||
} else { | |||
if (log.isInfoEnabled()) log.info(prefix + "using default filter: " +getInsertionRegex()); | |||
reader = new RegexFilterReader(new InputStreamReader(in), new RegexReplacementFilter(getInsertionRegex(), replacement)) | |||
reader = new RegexFilterReader(new InputStreamReader(in, charset), new RegexReplacementFilter(getInsertionRegex(), replacement)) | |||
.setName(filterNamePrefix + getInsertionRegex()) | |||
.setMaxMatches(1); | |||
} | |||
@@ -213,7 +216,7 @@ public abstract class AbstractAppRuleDriver implements AppRuleDriver { | |||
} | |||
} | |||
return new ReaderInputStream(reader, UTF8cs); | |||
return new ReaderInputStream(reader, charset); | |||
} | |||
protected String getBubbleJs(FilterHttpRequest filterRequest, | |||
@@ -24,6 +24,7 @@ import org.slf4j.LoggerFactory; | |||
import java.io.ByteArrayInputStream; | |||
import java.io.InputStream; | |||
import java.nio.charset.Charset; | |||
import java.util.Map; | |||
import java.util.Set; | |||
@@ -155,12 +156,12 @@ public interface AppRuleDriver { | |||
default InputStream doFilterRequest(InputStream in) { return in; } | |||
default InputStream filterResponse(FilterHttpRequest filterRequest, InputStream in) { | |||
if (hasNext()) return doFilterResponse(filterRequest, getNext().filterResponse(filterRequest, in)); | |||
return doFilterResponse(filterRequest, in); | |||
default InputStream filterResponse(FilterHttpRequest filterRequest, InputStream in, Charset charset) { | |||
if (hasNext()) return doFilterResponse(filterRequest, getNext().filterResponse(filterRequest, in, charset), charset); | |||
return doFilterResponse(filterRequest, in, charset); | |||
} | |||
default InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in) { return in; } | |||
default InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in, Charset charset) { return in; } | |||
default String resolveResource(String res, Map<String, Object> ctx) { | |||
final String resource = locateResource(res); | |||
@@ -30,6 +30,7 @@ import org.glassfish.jersey.server.ContainerRequest; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import java.net.URI; | |||
import java.nio.charset.Charset; | |||
import java.util.*; | |||
import java.util.concurrent.ConcurrentHashMap; | |||
import java.util.concurrent.atomic.AtomicReference; | |||
@@ -331,7 +332,7 @@ public class BubbleBlockRuleDriver extends TrafficAnalyticsRuleDriver | |||
public static final String FILTER_CTX_DECISION = "decision"; | |||
public static final String BLOCK_STATS_JS = "BLOCK_STATS_JS"; | |||
@Override public InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in) { | |||
@Override public InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in, Charset charset) { | |||
final FilterMatchersRequest request = filterRequest.getMatchersResponse().getRequest(); | |||
final String prefix = "doFilterResponse("+filterRequest.getId()+"): "; | |||
@@ -384,14 +385,14 @@ public class BubbleBlockRuleDriver extends TrafficAnalyticsRuleDriver | |||
} | |||
if (bubbleBlockConfig.inPageBlocks() && showStats) { | |||
if (log.isInfoEnabled()) log.info(prefix + "SEND: both inPageBlocks and showStats are true, filtering"); | |||
return filterInsertJs(in, filterRequest, filterCtx, BUBBLE_JS_TEMPLATE, getBubbleJsStatsTemplate(), BLOCK_STATS_JS, showStats); | |||
return filterInsertJs(in, charset, filterRequest, filterCtx, BUBBLE_JS_TEMPLATE, getBubbleJsStatsTemplate(), BLOCK_STATS_JS, showStats); | |||
} | |||
if (bubbleBlockConfig.inPageBlocks()) { | |||
if (log.isInfoEnabled()) log.info(prefix + "SEND: both inPageBlocks is true, filtering"); | |||
return filterInsertJs(in, filterRequest, filterCtx, BUBBLE_JS_TEMPLATE, EMPTY, BLOCK_STATS_JS, showStats); | |||
return filterInsertJs(in, charset, filterRequest, filterCtx, BUBBLE_JS_TEMPLATE, EMPTY, BLOCK_STATS_JS, showStats); | |||
} | |||
if (log.isInfoEnabled()) log.info(prefix+"inserting JS for stats into: "+request.getUrl()+" with Content-Type: "+filterRequest.getContentType()); | |||
return filterInsertJs(in, filterRequest, filterCtx, getBubbleJsStatsTemplate(), null, null, showStats); | |||
return filterInsertJs(in, charset, filterRequest, filterCtx, getBubbleJsStatsTemplate(), null, null, showStats); | |||
} | |||
protected String getBubbleJsStatsTemplate () { | |||
@@ -12,6 +12,7 @@ import lombok.Getter; | |||
import lombok.extern.slf4j.Slf4j; | |||
import java.io.InputStream; | |||
import java.nio.charset.Charset; | |||
import static org.cobbzilla.util.io.FileUtil.basename; | |||
import static org.cobbzilla.util.io.StreamUtil.stream2string; | |||
@@ -38,10 +39,10 @@ public class JsUserBlockerRuleDriver extends AbstractAppRuleDriver implements Re | |||
return loadTemplate(getDefaultSiteJsTemplate(), basename(getRequestModifierConfig().getSiteJsTemplate())); | |||
} | |||
@Override public InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in) { | |||
@Override public InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in, Charset charset) { | |||
if (!filterRequest.isHtml()) return in; | |||
final String bubbleJsTemplate = loadTemplate(BUBBLE_JS_TEMPLATE, BUBBLE_JS_TEMPLATE_NAME); | |||
final String siteJsTemplate = getSiteJsTemplate(); | |||
return filterInsertJs(in, filterRequest, null, bubbleJsTemplate, siteJsTemplate, CTX_APPLY_BLOCKS_JS, true); | |||
return filterInsertJs(in, charset, filterRequest, null, bubbleJsTemplate, siteJsTemplate, CTX_APPLY_BLOCKS_JS, true); | |||
} | |||
} |
@@ -18,13 +18,13 @@ import org.cobbzilla.util.io.regex.RegexInsertionFilter; | |||
import org.cobbzilla.util.io.regex.RegexStreamFilter; | |||
import java.io.InputStream; | |||
import java.nio.charset.Charset; | |||
import java.util.HashMap; | |||
import java.util.Map; | |||
import java.util.Set; | |||
import static org.cobbzilla.util.daemon.ZillaRuntime.die; | |||
import static org.cobbzilla.util.json.JsonUtil.json; | |||
import static org.cobbzilla.util.string.StringUtil.UTF8cs; | |||
@Slf4j | |||
public class UserBlockerRuleDriver extends AbstractAppRuleDriver { | |||
@@ -60,14 +60,14 @@ public class UserBlockerRuleDriver extends AbstractAppRuleDriver { | |||
protected UserBlockerConfig configObject() { return json(getFullConfig(), UserBlockerConfig.class); } | |||
@Override public InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in) { | |||
@Override public InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in, Charset charset) { | |||
if (!filterRequest.isHtml()) return in; | |||
final String requestId = filterRequest.getId(); | |||
final UserBlockerStreamFilter filter = new UserBlockerStreamFilter(requestId, matcher, rule, configuration.getHttp().getBaseUri()); | |||
filter.configure(getFullConfig()); | |||
filter.setDataDAO(appDataDAO); | |||
RegexFilterReader reader = new RegexFilterReader(in, RESPONSE_BUFSIZ, filter).setName("mainFilterReader"); | |||
RegexFilterReader reader = new RegexFilterReader(in, charset, RESPONSE_BUFSIZ, filter).setName("mainFilterReader"); | |||
final UserBlockerConfig config = configObject(); | |||
if (config.hasCommentDecorator()) { | |||
@@ -110,7 +110,7 @@ public class UserBlockerRuleDriver extends AbstractAppRuleDriver { | |||
} | |||
} | |||
return new ReaderInputStream(reader, UTF8cs); | |||
return new ReaderInputStream(reader, charset); | |||
} | |||
protected String startElementRegex(String el) { return "(<\\s*" + el + "[^>]*>)"; } | |||
@@ -5,6 +5,9 @@ | |||
package bubble.service.stream; | |||
import bubble.resources.stream.FilterHttpRequest; | |||
import bubble.service.stream.charset.BubbleCharSet; | |||
import bubble.service.stream.charset.CharsetDetector; | |||
import lombok.Cleanup; | |||
import lombok.Getter; | |||
import lombok.extern.slf4j.Slf4j; | |||
import org.apache.commons.io.IOUtils; | |||
@@ -21,9 +24,11 @@ import java.io.ByteArrayInputStream; | |||
import java.io.ByteArrayOutputStream; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import java.nio.charset.Charset; | |||
import java.util.List; | |||
import java.util.Map; | |||
import static bubble.service.stream.charset.CharsetDetector.charSetDetectorForContentType; | |||
import static java.util.concurrent.TimeUnit.DAYS; | |||
import static java.util.concurrent.TimeUnit.SECONDS; | |||
import static org.apache.commons.lang3.ArrayUtils.EMPTY_BYTE_ARRAY; | |||
@@ -32,15 +37,11 @@ import static org.cobbzilla.util.daemon.ZillaRuntime.shortError; | |||
import static org.cobbzilla.util.io.NullInputStream.NULL_STREAM; | |||
@Slf4j | |||
class ActiveStreamState { | |||
public class ActiveStreamState { | |||
public static final int DEFAULT_BYTE_BUFFER_SIZE = (int) (8 * Bytes.KB); | |||
public static final long MAX_BYTE_BUFFER_SIZE = (64 * Bytes.KB); | |||
// do not wrap input with encoding stream until we have received at least this many bytes | |||
// this avoids errors when creating a GZIPInputStream when only one or a few bytes are available | |||
public static final long MIN_BYTES_BEFORE_WRAP = Bytes.KB; | |||
// If no data is readable for this long, shut down the underlying MultiStream | |||
public static final long UNDERFLOW_TIMEOUT = SECONDS.toMillis(60); | |||
@@ -69,6 +70,7 @@ class ActiveStreamState { | |||
private InputStream output = null; | |||
private long totalBytesWritten = 0; | |||
private long totalBytesRead = 0; | |||
private CharsetDetector charsetDetector; | |||
public ActiveStreamState(FilterHttpRequest request, | |||
List<AppRuleHarness> rules) { | |||
@@ -76,6 +78,7 @@ class ActiveStreamState { | |||
this.requestId = request.getId(); | |||
this.encoding = request.getEncoding(); | |||
this.firstRule = rules.get(0); | |||
this.charsetDetector = charSetDetectorForContentType(request.getContentType()); | |||
final String prefix = "ActiveStreamState("+reqId()+"): "; | |||
if (empty(rules)) { | |||
@@ -130,8 +133,13 @@ class ActiveStreamState { | |||
} | |||
// do not wrap input with encoding stream until we have received at least MIN_BYTES_BEFORE_WRAP bytes | |||
// this avoids errors when creating a GZIPInputStream when only one or a few bytes are available | |||
if (output == null && totalBytesWritten > MIN_BYTES_BEFORE_WRAP) { | |||
output = outputStream(firstRule.getDriver().filterResponse(request, inputStream(multiStream))); | |||
if (output == null && totalBytesWritten > StreamConstants.MIN_BYTES_BEFORE_WRAP) { | |||
log.info("addChunk: detecting charset using "+charsetDetector.getClass().getSimpleName()); | |||
final BubbleCharSet cs = getCharSet(false); | |||
log.info("addChunk: detected charset: "+cs); | |||
if (cs != null) { | |||
output = outputStream(firstRule.getDriver().filterResponse(request, inputStream(multiStream), cs.getCharset())); | |||
} | |||
} | |||
} | |||
} | |||
@@ -147,7 +155,27 @@ class ActiveStreamState { | |||
multiStream.addLastStream(chunkStream); | |||
} | |||
if (output == null) { | |||
output = outputStream(firstRule.getDriver().filterResponse(request, inputStream(multiStream))); | |||
log.info("addLastChunk: detecting charset using "+charsetDetector.getClass().getSimpleName()); | |||
final BubbleCharSet cs = getCharSet(true); | |||
log.info("addLastChunk: detected charset: "+cs); | |||
final Charset charset; | |||
if (cs == null) { | |||
log.warn(prefix("addLastChunk")+"no charset could be determined"); | |||
charset = null; | |||
} else { | |||
charset = cs.getCharset(); | |||
} | |||
output = outputStream(firstRule.getDriver().filterResponse(request, inputStream(multiStream), charset)); | |||
} | |||
} | |||
// Asks charsetDetector for the charset of the data held by multiStream. | |||
// The 'last' flag and totalBytesWritten are passed straight through to the detector. | |||
// multiStream is marked before the detector reads and reset afterwards, whether or not detection throws. | |||
public BubbleCharSet getCharSet(boolean last) throws IOException { | |||
try { | |||
// NOTE(review): the long -> int cast truncates once totalBytesWritten exceeds Integer.MAX_VALUE -- confirm this cannot happen here | |||
multiStream.mark((int) totalBytesWritten); | |||
// NOTE(review): @Cleanup closes this wrapper on exit; presumably closing it leaves multiStream usable for the reset() below -- confirm | |||
@Cleanup final InputStream in = inputStream(multiStream); | |||
return charsetDetector.getCharSet(in, totalBytesWritten, last); | |||
} finally { | |||
// rewind to the mark set above | |||
multiStream.reset(); | |||
} | |||
} | |||
@@ -211,7 +239,7 @@ class ActiveStreamState { | |||
if (log.isDebugEnabled()) log.debug(prefix+"identity encoding, returning baseStream unmodified"); | |||
return baseStream; | |||
} else if (doNotWrap.containsKey(url)) { | |||
} else if (url == null || doNotWrap.containsKey(url)) { | |||
if (log.isDebugEnabled()) log.debug(prefix+"previous error wrapping encoding, returning baseStream unmodified"); | |||
encoding = null; | |||
return baseStream; | |||
@@ -18,6 +18,8 @@ import bubble.resources.stream.FilterMatchersRequest; | |||
import bubble.rule.AppRuleDriver; | |||
import bubble.rule.FilterMatchDecision; | |||
import bubble.server.BubbleConfiguration; | |||
import bubble.service.stream.charset.BubbleCharSet; | |||
import bubble.service.stream.charset.CharsetDetector; | |||
import lombok.Cleanup; | |||
import lombok.Getter; | |||
import lombok.extern.slf4j.Slf4j; | |||
@@ -51,6 +53,7 @@ import javax.ws.rs.core.Response; | |||
import java.io.ByteArrayOutputStream; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import java.nio.charset.Charset; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.List; | |||
@@ -64,7 +67,8 @@ import static java.util.concurrent.TimeUnit.MINUTES; | |||
import static javax.ws.rs.core.HttpHeaders.CONTENT_LENGTH; | |||
import static org.apache.http.HttpHeaders.CONTENT_TYPE; | |||
import static org.apache.http.HttpHeaders.TRANSFER_ENCODING; | |||
import static org.cobbzilla.util.daemon.ZillaRuntime.*; | |||
import static org.cobbzilla.util.daemon.ZillaRuntime.empty; | |||
import static org.cobbzilla.util.daemon.ZillaRuntime.hashOf; | |||
import static org.cobbzilla.util.http.HttpStatusCodes.OK; | |||
import static org.cobbzilla.util.json.JsonUtil.COMPACT_MAPPER; | |||
import static org.cobbzilla.util.json.JsonUtil.json; | |||
@@ -141,7 +145,17 @@ public class StandardRuleEngineService implements RuleEngineService { | |||
// filter response. when stream is closed, close http client | |||
final Header contentTypeHeader = proxyResponse.getFirstHeader(CONTENT_TYPE); | |||
filterRequest.setContentType(contentTypeHeader == null ? null : contentTypeHeader.getValue()); | |||
final InputStream responseEntity = firstRule.getDriver().filterResponse(filterRequest, new HttpClosingFilterInputStream(httpClient, proxyResponse)); | |||
final InputStream in = new HttpClosingFilterInputStream(httpClient, proxyResponse); | |||
// do we have a content length? | |||
final Header contentLengthHeader = proxyResponse.getFirstHeader(CONTENT_LENGTH); | |||
final Long contentLength = contentLengthHeader == null ? null : Long.parseLong(contentLengthHeader.getValue()); | |||
filterRequest.setContentLength(contentLength); | |||
final CharsetDetector charsetDetector = CharsetDetector.charSetDetectorForContentType(filterRequest.getContentType()); | |||
final BubbleCharSet cs = charsetDetector.getCharSet(in, contentLength != null ? contentLength : 1024, true); | |||
final Charset charset = cs == null ? null : cs.getCharset(); | |||
final InputStream responseEntity = firstRule.getDriver().filterResponse(filterRequest, in, charset); | |||
// send response | |||
return sendResponse(responseEntity, proxyResponse); | |||
@@ -0,0 +1,11 @@ | |||
package bubble.service.stream; | |||
import org.cobbzilla.util.system.Bytes; | |||
public class StreamConstants { | |||
// do not wrap input with encoding stream until we have received at least this many bytes | |||
// this avoids errors when creating a GZIPInputStream when only one or a few bytes are available | |||
public static final long MIN_BYTES_BEFORE_WRAP = Bytes.KB; | |||
} |
@@ -0,0 +1,27 @@ | |||
package bubble.service.stream.charset; | |||
import lombok.AllArgsConstructor; | |||
import lombok.Getter; | |||
import lombok.ToString; | |||
import java.nio.charset.Charset; | |||
import java.util.Map; | |||
import java.util.concurrent.ConcurrentHashMap; | |||
import static org.cobbzilla.util.string.StringUtil.UTF8cs; | |||
@AllArgsConstructor @ToString(of="charset") | |||
public class BubbleCharSet { | |||
private static final Map<Charset, BubbleCharSet> cache = new ConcurrentHashMap<>(10); | |||
public static BubbleCharSet forCharSet(Charset cs) { | |||
return cache.computeIfAbsent(cs, BubbleCharSet::new); | |||
} | |||
public static final BubbleCharSet RAW = new BubbleCharSet(null); | |||
public static final BubbleCharSet UTF8 = forCharSet(UTF8cs); | |||
@Getter private final Charset charset; | |||
} |
@@ -0,0 +1,23 @@ | |||
package bubble.service.stream.charset; | |||
import java.io.InputStream; | |||
import static bubble.service.stream.charset.HtmlCharsetDetector.htmlCharSetDetector; | |||
import static org.cobbzilla.util.http.HttpContentTypes.isHtml; | |||
public interface CharsetDetector { | |||
CharsetDetector SKIP_CHARSET_DETECTION = new SkipCharsetDetection(); | |||
static CharsetDetector charSetDetectorForContentType(String contentType) { | |||
if (isHtml(contentType)) return htmlCharSetDetector(contentType); | |||
return SKIP_CHARSET_DETECTION; | |||
} | |||
BubbleCharSet getCharSet(InputStream in, long size, boolean last); | |||
class SkipCharsetDetection implements CharsetDetector { | |||
@Override public BubbleCharSet getCharSet(InputStream in, long size, boolean last) { return BubbleCharSet.RAW; } | |||
} | |||
} |
@@ -0,0 +1,43 @@ | |||
package bubble.service.stream.charset; | |||
import lombok.AllArgsConstructor; | |||
import lombok.extern.slf4j.Slf4j; | |||
import java.io.InputStream; | |||
import java.nio.charset.Charset; | |||
import java.util.Map; | |||
import java.util.concurrent.ConcurrentHashMap; | |||
import static bubble.service.stream.charset.HtmlStreamCharsetDetector.HTML_STREAM_CHARSET_DETECTOR; | |||
@Slf4j | |||
public abstract class HtmlCharsetDetector implements CharsetDetector { | |||
private static final Map<String, HtmlCharsetDetector> detectors = new ConcurrentHashMap<>(10); | |||
public static final String CONTENT_TYPE_CHARSET = "charset="; | |||
public static CharsetDetector htmlCharSetDetector(String contentType) { | |||
return detectors.computeIfAbsent(contentType, ct -> { | |||
final int csPos = ct.indexOf(CONTENT_TYPE_CHARSET); | |||
if (csPos == -1) return HTML_STREAM_CHARSET_DETECTOR; | |||
final String charsetName = ct.substring(csPos + CONTENT_TYPE_CHARSET.length()); | |||
try { | |||
final Charset cs = Charset.forName(charsetName); | |||
return new HtmlContentTypeCharSet(cs); | |||
} catch (Exception e) { | |||
log.error("htmlCharSetDetector: invalid charset, returning HtmlStreamCharsetDetector: "+charsetName); | |||
return HTML_STREAM_CHARSET_DETECTOR; | |||
} | |||
}); | |||
} | |||
@AllArgsConstructor | |||
public static class HtmlContentTypeCharSet extends HtmlCharsetDetector { | |||
private final Charset charset; | |||
@Override public BubbleCharSet getCharSet(InputStream in, long size, boolean last) { | |||
return BubbleCharSet.forCharSet(charset); | |||
} | |||
} | |||
} |
@@ -0,0 +1,92 @@ | |||
package bubble.service.stream.charset; | |||
import lombok.extern.slf4j.Slf4j; | |||
import java.io.InputStream; | |||
import java.nio.charset.Charset; | |||
import java.util.regex.Matcher; | |||
import java.util.regex.Pattern; | |||
import static bubble.service.stream.StreamConstants.MIN_BYTES_BEFORE_WRAP; | |||
import static org.cobbzilla.util.daemon.ZillaRuntime.shortError; | |||
import static org.cobbzilla.util.string.StringUtil.UTF8cs; | |||
@Slf4j | |||
public class HtmlStreamCharsetDetector extends HtmlCharsetDetector { | |||
public static final HtmlStreamCharsetDetector HTML_STREAM_CHARSET_DETECTOR = new HtmlStreamCharsetDetector(); | |||
private static final Pattern HTML_CONTENT_TYPE_EQUIV_CHARSET | |||
= Pattern.compile("<meta\\s+http-equiv\\s*=\\s*\"Content-Type\"\\s+content=\"[/\\w]+\\s*;\\s*charset=([-\\w]+)\\s*\"\\s*>", Pattern.CASE_INSENSITIVE); | |||
private static final Pattern HTML_META_CHARSET | |||
= Pattern.compile("<meta\\s+charset\\s*=\\s*\"([-\\w]+)\">", Pattern.CASE_INSENSITIVE); | |||
private static final Pattern HTML_CLOSE_HEAD | |||
= Pattern.compile("</head[^>]*>", Pattern.CASE_INSENSITIVE); | |||
@Override public BubbleCharSet getCharSet(InputStream in, long size, boolean last) { | |||
final byte[] buffer = new byte[(int) MIN_BYTES_BEFORE_WRAP]; | |||
int count; | |||
String fullData = null; | |||
try { | |||
final StringBuilder b = new StringBuilder(); | |||
int bytesRead = 0; | |||
boolean zeroRead = false; | |||
while (bytesRead < size && (count = in.read(buffer, 0, readSize(size, buffer.length, bytesRead))) != -1) { | |||
if (count == 0) { | |||
// reached end of multi-stream, if this is our second zero read, bail out | |||
if (zeroRead) { | |||
if (last) { | |||
if (log.isDebugEnabled()) log.debug("getCharSet: exhausted stream and no match found, returning UTF-8"); | |||
return BubbleCharSet.UTF8; | |||
} | |||
if (log.isDebugEnabled()) log.debug("getCharSet: two zero reads, must be at end of multi-stream, returning null"); | |||
return null; | |||
} | |||
zeroRead = true; | |||
} | |||
final String data = new String(buffer, 0, count); | |||
b.append(data); | |||
fullData = b.toString(); | |||
final Matcher metaMatcher = HTML_META_CHARSET.matcher(fullData); | |||
if (metaMatcher.find()) { | |||
return BubbleCharSet.forCharSet(safeCharSet(metaMatcher.group(1))); | |||
} | |||
final Matcher equivMatcher = HTML_CONTENT_TYPE_EQUIV_CHARSET.matcher(fullData); | |||
if (equivMatcher.find()) { | |||
return BubbleCharSet.forCharSet(safeCharSet(equivMatcher.group(1))); | |||
} | |||
final Matcher headCloseMatcher = HTML_CLOSE_HEAD.matcher(fullData); | |||
if (headCloseMatcher.find()) { | |||
if (log.isDebugEnabled()) log.debug("getCharSet: found head closing tag before any charset specifier, returning UTF-8"); | |||
return BubbleCharSet.UTF8; | |||
} | |||
} | |||
if (last) { | |||
if (log.isDebugEnabled()) log.debug("getCharSet: exhausted stream and no match found, returning UTF-8"); | |||
return BubbleCharSet.UTF8; | |||
} | |||
if (log.isDebugEnabled()) log.debug("getCharSet: exhausted stream and no match found, but more data may be coming, returning null"); | |||
return null; | |||
} catch (Exception e) { | |||
log.error("getCharSet: io error, returning UTF-8: "+shortError(e)); | |||
return BubbleCharSet.UTF8; | |||
} | |||
} | |||
private Charset safeCharSet(String csName) { | |||
try { | |||
return Charset.forName(csName); | |||
} catch (Exception e) { | |||
log.error("safeCharSet: invalid name, returning UTF-8: "+csName); | |||
return UTF8cs; | |||
} | |||
} | |||
private int readSize(long size, int bufsiz, int bytesRead) { | |||
return bytesRead + bufsiz < size ? bufsiz : (int) (bufsiz - (size - bytesRead)); | |||
} | |||
} |
@@ -0,0 +1,138 @@ | |||
package bubble.test.filter; | |||
import bubble.model.app.AppMatcher; | |||
import bubble.model.app.AppRule; | |||
import bubble.resources.stream.FilterHttpRequest; | |||
import bubble.resources.stream.FilterMatchersRequest; | |||
import bubble.resources.stream.FilterMatchersResponse; | |||
import bubble.service.stream.ActiveStreamState; | |||
import bubble.service.stream.AppRuleHarness; | |||
import org.apache.commons.io.IOUtils; | |||
import org.cobbzilla.util.collection.SingletonList; | |||
import org.cobbzilla.util.http.HttpContentEncodingType; | |||
import org.junit.Test; | |||
import java.io.ByteArrayInputStream; | |||
import java.io.ByteArrayOutputStream; | |||
import java.io.InputStream; | |||
import java.nio.charset.Charset; | |||
import java.util.ArrayList; | |||
import java.util.List; | |||
import static org.cobbzilla.util.http.HttpContentEncodingType.*; | |||
import static org.cobbzilla.util.http.HttpContentTypes.TEXT_HTML; | |||
import static org.cobbzilla.util.io.StreamUtil.stream2bytes; | |||
import static org.cobbzilla.util.security.ShaUtil.sha256_base64; | |||
import static org.junit.Assert.*; | |||
public class CharsetDetectionTest { | |||
public static final byte[] WIN_1250_TEST = stream2bytes("charset_detection/meta-windows-1250.html"); | |||
public static final byte[] WIN_1250_LATE_TEST = stream2bytes("charset_detection/meta-windows-1250-late.html"); | |||
public static final byte[] WIN_1250_EQUIV_TEST = stream2bytes("charset_detection/equiv-windows-1250.html"); | |||
@Test public void testNonUTF8Charset () throws Exception { | |||
// read first chunk exactly 288 bytes, so it ends in the middle of "charset" | |||
_testNonUTF8Charset(WIN_1250_TEST, 288, null); | |||
} | |||
@Test public void testNonUTF8Charset_gzip () throws Exception { | |||
// for gzip we won't be able to break exactly on charset, | |||
// but try a small read anyway to make sure nothing breaks | |||
_testNonUTF8Charset(gzip.encode(WIN_1250_TEST), 288, gzip); | |||
} | |||
@Test public void testNonUTF8CharsetLate () throws Exception { | |||
_testNonUTF8Charset(WIN_1250_LATE_TEST, 1024, null); | |||
} | |||
@Test public void testNonUTF8CharsetLate_brotli () throws Exception { | |||
_testNonUTF8Charset(br.encode(WIN_1250_LATE_TEST), 1024, br); | |||
} | |||
@Test public void testNonUTF8CharsetEquiv () throws Exception { | |||
_testNonUTF8Charset(WIN_1250_EQUIV_TEST, 1024, null); | |||
} | |||
@Test public void testNonUTF8CharsetEquiv_deflate () throws Exception { | |||
_testNonUTF8Charset(deflate.encode(WIN_1250_EQUIV_TEST), 1024, deflate); | |||
} | |||
private void _testNonUTF8Charset(byte[] test, int initialReadSize, HttpContentEncodingType encoding) throws Exception { | |||
final FilterHttpRequest request = new FilterHttpRequest() | |||
.setMatchersResponse(new FilterMatchersResponse() | |||
.setMatchers(new SingletonList<>(new AppMatcher())) | |||
.setRequest(new FilterMatchersRequest() | |||
.setFqdn("example.com") | |||
.setUri("/test_"+sha256_base64(test)+".html"))) | |||
.setContentType(TEXT_HTML) | |||
.setEncoding(encoding); | |||
final List<AppRuleHarness> rules = new ArrayList<>(); | |||
final PassthruDriver driver = new PassthruDriver(); | |||
final AppRuleHarness passthruRuleHarness = passthruRuleHarness(driver); | |||
rules.add(passthruRuleHarness); | |||
final ActiveStreamState streamState = new ActiveStreamState(request, rules); | |||
final byte[] buffer = new byte[8192]; | |||
final ByteArrayInputStream in = new ByteArrayInputStream(test); | |||
final ByteArrayOutputStream out = new ByteArrayOutputStream(test.length); | |||
// add first chunk, no charset yet found | |||
final byte[] buf = new byte[8192]; | |||
final int initialActualRead = in.read(buffer, 0, initialReadSize); | |||
assertEquals("expected first read to read "+initialReadSize+"bytes", initialReadSize, initialActualRead); | |||
System.arraycopy(buffer, 0, buf, 0, initialReadSize); | |||
streamState.addChunk(new ByteArrayInputStream(buf, 0, initialReadSize), initialReadSize); | |||
Charset charset = driver.getLastSeenCharset(); | |||
assertNull("expected no charset to be found in the first chunk", charset); | |||
// do not expect to have found a charset yet | |||
InputStream response = streamState.getResponseStream(false); | |||
IOUtils.copyLarge(response, out); | |||
// add remaining chunks, while reading data back | |||
int count; | |||
Charset lastSeenCharset = null; | |||
int responseCount = 0; | |||
while ((count = in.read(buffer)) != -1) { | |||
System.arraycopy(buffer, 0, buf, 0, count); | |||
streamState.addChunk(new ByteArrayInputStream(buf, 0, count), count); | |||
response = streamState.getResponseStream(false); | |||
responseCount++; | |||
charset = driver.getLastSeenCharset(); | |||
if (charset != null) { | |||
if (lastSeenCharset == null) { | |||
lastSeenCharset = charset; | |||
} else { | |||
// charset cannot change | |||
assertEquals("expected charset to be same as lastSeenCharset", lastSeenCharset, charset); | |||
} | |||
// charset must be windows-1250 | |||
assertEquals("expected windows-1250 charset", "windows-1250", charset.name()); | |||
} | |||
IOUtils.copyLarge(response, out); | |||
} | |||
assertNotNull("expected to find a charset", lastSeenCharset); | |||
assertEquals("expected windows-1250 charset", "windows-1250", lastSeenCharset.name()); | |||
// add last empty chunk | |||
streamState.addLastChunk(new ByteArrayInputStream(new byte[0]), 0); | |||
// read the data back | |||
response = streamState.getResponseStream(true); | |||
IOUtils.copyLarge(response, out); | |||
final byte[] actualBytes = out.toByteArray(); | |||
final String expectedHtml = new String(encoding == null ? test : encoding.decode(test), lastSeenCharset); | |||
final String actualHtml = new String(encoding == null ? actualBytes : encoding.decode(actualBytes), lastSeenCharset); | |||
assertEquals("expected output to be same as input", expectedHtml, actualHtml); | |||
} | |||
private AppRuleHarness passthruRuleHarness(PassthruDriver driver) { | |||
final AppRuleHarness appRuleHarness = new AppRuleHarness(new AppMatcher(), new AppRule()); | |||
appRuleHarness.setDriver(driver); | |||
return appRuleHarness; | |||
} | |||
} |
@@ -0,0 +1,21 @@ | |||
package bubble.test.filter; | |||
import bubble.resources.stream.FilterHttpRequest; | |||
import bubble.rule.AbstractAppRuleDriver; | |||
import lombok.Getter; | |||
import java.io.InputStream; | |||
import java.nio.charset.Charset; | |||
public class PassthruDriver extends AbstractAppRuleDriver { | |||
@Getter private Charset lastSeenCharset; | |||
@Override public InputStream doFilterResponse(FilterHttpRequest filterRequest, InputStream in, Charset charset) { | |||
this.lastSeenCharset = charset; | |||
return super.doFilterResponse(filterRequest, in, charset); | |||
} | |||
@Override public boolean couldModify(FilterHttpRequest request) { return true; } | |||
} |
@@ -73,6 +73,7 @@ | |||
<include>bubble.test.promo.AccountCreditTest</include> | |||
<include>bubble.test.promo.MultiplePromotionsTest</include> | |||
<include>bubble.test.system.DriverTest</include> | |||
<include>bubble.test.filter.CharsetDetectionTest</include> | |||
<include>bubble.test.filter.ProxyTest</include> | |||
<include>bubble.test.filter.TrafficAnalyticsTest</include> | |||
<include>bubble.test.filter.BlockSummaryTest</include> | |||
@@ -80,6 +81,7 @@ | |||
<include>bubble.test.system.BackupTest</include> | |||
<include>bubble.test.system.NetworkTest</include> | |||
<include>bubble.abp.spec.BlockListTest</include> | |||
<include>org.cobbzilla.util.io.regex.RegexFilterReaderTest</include> | |||
</includes> | |||
</configuration> | |||
</plugin> | |||
@@ -1 +1 @@ | |||
Subproject commit bcb92061c3482ba4803c0030c78fb550206a0511 | |||
Subproject commit 2324f5d196fa52931f07e85c830bf9c10465b8f8 |
@@ -1 +1 @@ | |||
Subproject commit 1cfe96e59ca2c98955a379e539183a5596dab33f | |||
Subproject commit a9d3d69f1112a47b8d69f9343cc66caf5bab1383 |
@@ -1 +1 @@ | |||
Subproject commit cd9cf18c90a373f024163c048d57cf574ec97293 | |||
Subproject commit e91152f296a04ce1e4363ae90e1cfde70cdee4d5 |