HtmlCharsetProcessor.java

package org.esigate.extension;

import java.nio.charset.Charset;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.esigate.Driver;
import org.esigate.events.Event;
import org.esigate.events.EventDefinition;
import org.esigate.events.EventManager;
import org.esigate.events.IEventListener;
import org.esigate.events.impl.ReadEntityEvent;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This extension reads html and xhtml documents, and ensure document has been read using the right charset. This
 * prevents charset issues when the remote server provides a wrong charset or no charset at all in HTTP headers even if
 * document is not ISO-8859-1 (the default).
 * 
 * <p>
 * To be processed by this extension, documents must have one of the following MIME types:
 * <ul>
 * <li>text/html</li>
 * <li>application/xhtml+xml</li>
 * </ul>
 * ... and this MIME type must be declared as parsableContentTypes in configuration file (esigate.properties).
 * 
 * @see <a href="http://www.esigate.org/reference.html#Configuration_file">Configuration file</a>
 * 
 * @author Nicolas Richeton
 * 
 */
public class HtmlCharsetProcessor implements Extension, IEventListener {
    private static final Logger LOG = LoggerFactory.getLogger(DefaultCharset.class);

    private static final Pattern PATTERN_META_HTML5 = Pattern.compile(
            ".*<head>.*<meta[^>]+charset=\"([^>^\"]+)\"[^>]*/?>.*</head>.*", Pattern.CASE_INSENSITIVE);
    private static final Pattern PATTERN_META_HTML4_XHTML = Pattern.compile(
            ".*<head>.*<meta[^>]+charset=([^>^\"]+)\"[^>]*/?>.*</head>.*", Pattern.CASE_INSENSITIVE);

    @Override
    public boolean event(EventDefinition id, Event event) {
        ReadEntityEvent readEntityEvent = (ReadEntityEvent) event;
        Charset charset = null;

        LOG.debug("Content mime type is {}", readEntityEvent.getMimeType());

        // Detect on supported MIME types.
        // ReadEntityEvent is only sent when esigate tries to parse a document.
        if ("text/html".equals(readEntityEvent.getMimeType())
                || "application/xhtml+xml".equals(readEntityEvent.getMimeType())) {
            LOG.debug("Supported MIME type, parsing content");

            Matcher m = PATTERN_META_HTML5.matcher(readEntityEvent.getEntityContent());
            if (m.matches()) {
                LOG.debug("Found HTML5 charset");
                charset = Charset.forName(m.group(1));
            }

            m = PATTERN_META_HTML4_XHTML.matcher(readEntityEvent.getEntityContent());
            if (m.matches()) {
                LOG.debug("Found HTML/XHTML charset");
                charset = Charset.forName(m.group(1));
            }
        }

        // If another charset was found, update String object
        if (charset != null && !charset.equals(readEntityEvent.getCharset())) {
            LOG.debug("Changing charset fom {} to {}", readEntityEvent.getCharset(), charset);
            readEntityEvent.setEntityContent(new String(readEntityEvent.getRawEntityContent(), charset));
        }

        return true;
    }

    @Override
    public void init(Driver driver, Properties properties) {
        driver.getEventManager().register(EventManager.EVENT_READ_ENTITY, this);
    }

}