// DefaultCharset.java

package org.esigate.extension;

import java.util.Collection;
import java.util.Properties;

import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.esigate.Driver;
import org.esigate.Parameters;
import org.esigate.events.Event;
import org.esigate.events.EventDefinition;
import org.esigate.events.EventManager;
import org.esigate.events.IEventListener;
import org.esigate.events.impl.FetchEvent;
import org.esigate.util.Parameter;
import org.esigate.util.ParameterString;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This extension adds a default charset to responses which lack the charset attribute in their Content-Type header.
 * Only parsable MIME types are modified:
 * 
 * <pre>
 * Content-Type: text/html
 * </pre>
 * 
 * becomes
 * 
 * <pre>
 * Content-Type: text/html; charset=utf-8
 * </pre>
 * 
 * <p>
 * The default charset can be set in esigate.properties using
 * 
 * <pre>
 * driverid.defaultCharset=utf-8
 * </pre>
 * 
 * @author Nicolas Richeton
 * 
 */
public class DefaultCharset implements Extension, IEventListener {
    private static final Logger LOG = LoggerFactory.getLogger(DefaultCharset.class);

    /** Name of the response header inspected and, when needed, rewritten by this extension. */
    private static final String CONTENT_TYPE = "Content-Type";

    /**
     * Configuration parameter {@code defaultCharset}. It is declared with {@code ISO-8859-1}, presumably the value
     * used when the property is not set (confirm against {@link ParameterString}).
     */
    public static final Parameter<String> PARAM_DEFAULT_CHARSET = new ParameterString("defaultCharset", "ISO-8859-1");

    /** Content type fragments for which a charset is added; read from the configuration in {@link #init}. */
    private Collection<String> parsableContentTypes;

    /** Charset appended to Content-Type headers that do not declare one; read from the configuration. */
    private String defaultCharset;

    /**
     * Reads the parsable content types and the default charset from the configuration, then registers this
     * extension as a listener for {@link EventManager#EVENT_FETCH_POST}.
     * 
     * @param driver
     *            the driver whose event manager this listener is registered with
     * @param properties
     *            the configuration the parameters are read from
     */
    @Override
    public void init(Driver driver, Properties properties) {
        // Load the configuration before registering, so that the listener never observes unset fields.
        parsableContentTypes = Parameters.PARSABLE_CONTENT_TYPES.getValue(properties);
        defaultCharset = PARAM_DEFAULT_CHARSET.getValue(properties);

        driver.getEventManager().register(EventManager.EVENT_FETCH_POST, this);

        // Parameterized logging: no manual concatenation, and a null collection is rendered instead of throwing.
        LOG.info("Will use {} as default charset for {}", defaultCharset, parsableContentTypes);
    }

    /**
     * Adds the default charset to the Content-Type header of the fetched response when the header is present, does
     * not already mention a charset, and matches one of the parsable content types.
     * 
     * @param arg0
     *            the event definition (not used)
     * @param arg1
     *            the event; it is cast to {@link FetchEvent}, which is presumably the only type fired for
     *            {@link EventManager#EVENT_FETCH_POST}, the sole event this listener registers for
     * @return always {@code true}
     */
    @Override
    public boolean event(EventDefinition arg0, Event arg1) {
        FetchEvent fe = (FetchEvent) arg1;

        Header contentTypeHeader = fe.getHttpResponse().getFirstHeader(CONTENT_TYPE);

        // No content type, there is nothing we can do
        if (contentTypeHeader == null) {
            return true;
        }

        String contentType = contentTypeHeader.getValue();

        // Charset is present -> OK
        if (StringUtils.containsIgnoreCase(contentType, "charset")) {
            return true;
        }

        // Add default charset
        if (isParsable(contentType)) {
            // Drop a dangling parameter separator ("text/html;") so the result does not contain ";;".
            String mediaType = StringUtils.stripEnd(contentType, "; \t");
            fe.getHttpResponse().setHeader(CONTENT_TYPE, mediaType + "; charset=" + defaultCharset);
        }

        return true;
    }

    /**
     * Tells whether the content type contains, ignoring case, one of the configured parsable content types.
     * 
     * @param contentType
     *            the Content-Type header value, may be {@code null}
     * @return {@code true} if at least one configured parsable content type is found in {@code contentType}
     */
    private boolean isParsable(String contentType) {
        for (String parsableContentType : parsableContentTypes) {
            if (StringUtils.containsIgnoreCase(contentType, parsableContentType)) {
                return true;
            }
        }
        return false;
    }

}