Clean and fix HTML with the HtmlCleaner Java library

package com.origami.sgm.util;

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleHtmlSerializer;
import org.htmlcleaner.TagNode;

 * @author Fernando
public abstract class HtmlUtil {
    public static String cleanHtml(String htmlText){
        HtmlCleaner cleaner = new HtmlCleaner();
        CleanerProperties props = cleaner.getProperties();
        TagNode node = cleaner.clean(htmlText);
        SimpleHtmlSerializer htmlSerializer = new SimpleHtmlSerializer(props);
        // remove <?xml definition tag:
        String htmlResult = htmlSerializer.getAsString(node).replaceAll("\\<\\?xml(.+?)\\?\\>", "").trim();
        return htmlResult;

Leave a Reply

Fill in your details below or click an icon to log in: Logo

You are commenting using your account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s