Clean and fix HTML with the HtmlCleaner Java library

package com.origami.sgm.util;

import java.util.regex.Pattern;

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleHtmlSerializer;
import org.htmlcleaner.TagNode;

/**
 * Static helper for cleaning HTML text with the HtmlCleaner library.
 *
 * @author Fernando
 */
public abstract class HtmlUtil {

    /**
     * Matches an XML declaration such as {@code <?xml version="1.0"?>}.
     * Compiled once so the regex is not recompiled on every call.
     */
    private static final Pattern XML_DECLARATION = Pattern.compile("\\<\\?xml(.+?)\\?\\>");

    /**
     * Parses the given HTML with {@link HtmlCleaner}, serializes the resulting
     * node tree back to a string with {@link SimpleHtmlSerializer}, and strips
     * any {@code <?xml ... ?>} declaration from the serialized output.
     *
     * @param htmlText the HTML text to clean; may be {@code null}
     * @return the serialized HTML with XML declarations removed and surrounding
     *         whitespace trimmed; an empty string when {@code htmlText} is
     *         {@code null}
     */
    public static String cleanHtml(String htmlText) {
        // Guard: a null input was previously handed straight to the parser.
        if (htmlText == null) {
            return "";
        }

        HtmlCleaner cleaner = new HtmlCleaner();
        CleanerProperties props = cleaner.getProperties();

        TagNode node = cleaner.clean(htmlText);
        String serialized = new SimpleHtmlSerializer(props).getAsString(node);

        // Remove the <?xml ...?> declaration from the serialized output.
        // NOTE(review): props.setOmitXmlDeclaration(true) may make this strip
        // unnecessary - confirm against the HtmlCleaner version in use.
        return XML_DECLARATION.matcher(serialized).replaceAll("").trim();
    }

}
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. ( Log Out / Change )

Twitter picture

You are commenting using your Twitter account. ( Log Out / Change )

Facebook photo

You are commenting using your Facebook account. ( Log Out / Change )

Google+ photo

You are commenting using your Google+ account. ( Log Out / Change )

Connecting to %s