UrlRewriter.java
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.esigate.impl;
import java.net.URI;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.esigate.util.UriUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* "fixes" links to resources, images and pages in pages retrieved by esigate :
* <ul>
* <li>Current-path-relative urls are converted to full path relative urls ( img/test.img ->
* /myapp/currentpath/img/test.img)</li>
* <li>All relative urls can be converted to absolute urls (including server name)</li>
* </ul>
*
* This enables use of esigate without any special modifications of the generated urls on the provider side.
*
* All src, href, action and background attributes are processed, as well as the content attribute of meta refresh
* tags, except javascript links.
*
* @author Nicolas Richeton
*
*/
public class UrlRewriter {
private static final Logger LOG = LoggerFactory.getLogger(UrlRewriter.class);
private static final Pattern URL_PATTERN = Pattern.compile(
"<([^\\!:>]+)(src|href|action|background|content)\\s*=\\s*('[^<']*'|\"[^<\"]*\")([^>]*)>",
Pattern.CASE_INSENSITIVE);
private static final Pattern JAVASCRIPT_CONCATENATION_PATTERN = Pattern.compile(
"\\+\\s*'|\\+\\s*\"|'\\s*\\+|\"\\s*\\+", Pattern.CASE_INSENSITIVE);
private static final Pattern META_REFRESH_PATTERN = Pattern.compile(
"<\\s*meta([^>]+)http-equiv\\s*=\\s*(\"|')refresh(\"|')", Pattern.CASE_INSENSITIVE);
/**
* Rewrites urls from the response for the client or from the request to the target server.
*
* If mode is ABSOLUTE, all relative urls will be replaced by the full urls :
* <ul>
* <li>images/image.png is replaced by http://server/context/images/image.png</li>
* <li>/context/images/image.png is replaced by http://server/context/images/image.png</li>
* </ul>
*
* If mode is RELATIVE, context will be added to relative urls :
* <ul>
* <li>images/image.png is replaced by /context/images/image.png</li>
* </ul>
*
*
*/
public UrlRewriter() {
}
/**
* Fixes a referer url in a request.
*
* @param referer
* the url to fix (can be anything found in an html page, relative, absolute, empty...)
* @param baseUrl
* The base URL selected for this request.
* @param visibleBaseUrl
* The base URL viewed by the browser.
*
* @return the fixed url.
*/
public String rewriteReferer(String referer, String baseUrl, String visibleBaseUrl) {
URI uri = UriUtils.createURI(referer);
// Base url should end with /
if (!baseUrl.endsWith("/")) {
baseUrl = baseUrl + "/";
}
URI baseUri = UriUtils.createURI(baseUrl);
// If no visible url base is defined, use base url as visible base url
if (!visibleBaseUrl.endsWith("/")) {
visibleBaseUrl = visibleBaseUrl + "/";
}
URI visibleBaseUri = UriUtils.createURI(visibleBaseUrl);
// Relativize url to visible base url
URI relativeUri = visibleBaseUri.relativize(uri);
// If the url is unchanged do nothing
if (relativeUri.equals(uri)) {
LOG.debug("url kept unchanged: [{}]", referer);
return referer;
}
// Else rewrite replacing baseUrl by visibleBaseUrl
URI result = baseUri.resolve(relativeUri);
LOG.debug("referer fixed: [{}] -> [{}]", referer, result);
return result.toString();
}
/**
* Fixes an url according to the chosen mode.
* <p>
* Note: urls starting with an ESI variable are not rewriten.
*
* @param url
* the url to fix (can be anything found in an html page, relative, absolute, empty...)
* @param requestUrl
* The incoming request URL (could be absolute or relative to visible base url).
* @param baseUrl
* The base URL selected for this request.
* @param visibleBaseUrl
* The base URL viewed by the browser.
* @param absolute
* Should the rewritten urls contain the scheme host and port
*
* @return the fixed url.
*/
public String rewriteUrl(String url, String requestUrl, String baseUrl, String visibleBaseUrl, boolean absolute) {
// Do not rewrite Urls starting with ESI variables
// This could be improved by detecting we are in an 'esi:vars' block,
// but this would link the rewriter with ESI parsing.
if (url.startsWith("$(")) {
return url;
}
// Base url should end with /
if (!baseUrl.endsWith("/")) {
baseUrl = baseUrl + "/";
}
URI baseUri = UriUtils.createURI(baseUrl);
// If no visible url base is defined, use base url as visible base url
if (!visibleBaseUrl.endsWith("/")) {
visibleBaseUrl = visibleBaseUrl + "/";
}
URI visibleBaseUri = UriUtils.createURI(visibleBaseUrl);
// Build the absolute Uri of the request sent to the backend
URI requestUri;
if (requestUrl.startsWith(visibleBaseUrl)) {
requestUri = UriUtils.createURI(requestUrl);
} else {
requestUri = UriUtils.concatPath(baseUri, requestUrl);
}
// Interpret the url relatively to the request url (may be relative)
URI uri = UriUtils.resolve(url, requestUri);
// Normalize the path (remove . or .. if possible)
uri = uri.normalize();
// Try to relativize url to base url
URI relativeUri = baseUri.relativize(uri);
// If the url is unchanged do nothing
if (relativeUri.equals(uri)) {
LOG.debug("url kept unchanged: [{}]", url);
return url;
}
// Else rewrite replacing baseUrl by visibleBaseUrl
URI result = visibleBaseUri.resolve(relativeUri);
// If mode relative, remove all the scheme://host:port to keep only a url relative to server root (starts with
// "/")
if (!absolute) {
result = UriUtils.removeServer(result);
}
LOG.debug("url fixed: [{}] -> [{}]", url, result);
return result.toString();
}
/**
* Fixes all resources urls and returns the result.
*
* @param input
* The html to be processed.
*
* @param requestUrl
* The request URL.
* @param baseUrlParam
* The base URL selected for this request.
* @param visibleBaseUrl
* The base URL viewed by the browser.
* @param absolute
* Should the rewritten urls contain the scheme host and port
*
* @return the result of this renderer.
*/
public CharSequence rewriteHtml(CharSequence input, String requestUrl, String baseUrlParam, String visibleBaseUrl,
boolean absolute) {
StringBuffer result = new StringBuffer(input.length());
Matcher m = URL_PATTERN.matcher(input);
while (m.find()) {
String url = input.subSequence(m.start(3) + 1, m.end(3) - 1).toString();
String tag = m.group(0);
String quote = input.subSequence(m.end(3) - 1, m.end(3)).toString();
// Browsers tolerate urls with white spaces before or after
String trimmedUrl = StringUtils.trim(url);
String rewrittenUrl = url;
trimmedUrl = unescapeHtml(trimmedUrl);
if (trimmedUrl.isEmpty()) {
LOG.debug("empty url kept unchanged");
} else if (trimmedUrl.startsWith("#")) {
LOG.debug("anchor url kept unchanged: [{}]", url);
} else if (JAVASCRIPT_CONCATENATION_PATTERN.matcher(trimmedUrl).find()) {
LOG.debug("url in javascript kept unchanged: [{}]", url);
} else if (m.group(2).equalsIgnoreCase("content")) {
if (META_REFRESH_PATTERN.matcher(tag).find()) {
rewrittenUrl = rewriteRefresh(trimmedUrl, requestUrl, baseUrlParam, visibleBaseUrl);
rewrittenUrl = escapeHtml(rewrittenUrl);
LOG.debug("refresh url [{}] rewritten [{}]", url, rewrittenUrl);
} else {
LOG.debug("content attribute kept unchanged: [{}]", url);
}
} else {
rewrittenUrl = rewriteUrl(trimmedUrl, requestUrl, baseUrlParam, visibleBaseUrl, absolute);
rewrittenUrl = escapeHtml(rewrittenUrl);
LOG.debug("url [{}] rewritten [{}]", url, rewrittenUrl);
}
m.appendReplacement(result, ""); // Copy what is between the previous match and the current match
result.append("<");
result.append(m.group(1));
result.append(m.group(2));
result.append("=");
result.append(quote);
result.append(rewrittenUrl);
result.append(quote);
if (m.groupCount() > 3) {
result.append(m.group(4));
}
result.append(">");
}
m.appendTail(result); // Copy the reminder of the input
return result;
}
private String unescapeHtml(String url) {
// Unescape entities, ex: ' or '
url = StringEscapeUtils.unescapeHtml4(url);
return url;
}
private String escapeHtml(String url) {
// Escape the previously unescaped characters
url = StringEscapeUtils.escapeHtml4(url);
// Replace " by " in order not to break the html
url = url.replaceAll("'", "'");
url = url.replaceAll("\"", """);
return url;
}
/**
* Rewrites a "Refresh" HTTP header or a <meta http-equiv="refresh"... tag. The value should have the following
* format:
*
* Refresh: 5; url=http://www.example.com
*
* @param input
* The refresh value to be rewritten.
* @param requestUrl
* The request URL.
* @param baseUrl
* The base URL selected for this request.
* @param visibleBaseUrl
* The base URL viewed by the browser.
* @return the rewritten refresh value
*/
public String rewriteRefresh(String input, String requestUrl, String baseUrl, String visibleBaseUrl) {
// Header has the following format
// Refresh: 5; url=http://www.w3.org/pub/WWW/People.html
int urlPosition = input.indexOf("url=");
if (urlPosition >= 0) {
String urlValue = input.substring(urlPosition + "url=".length());
String targetUrlValue = rewriteUrl(urlValue, requestUrl, baseUrl, visibleBaseUrl, true);
return input.substring(0, urlPosition) + "url=" + targetUrlValue;
} else {
return input;
}
}
}