Java 过滤 HTML 中的标签获取纯文本

/**
 * Matches a whole {@code <script ...>...</script>} element, case-insensitively,
 * tolerating whitespace after {@code <}, around {@code /} and before the closing {@code >}.
 * The body is matched lazily so each element ends at its nearest closing tag.
 */
private static final Pattern SCRIPT_PATTERN = Pattern.compile(
        "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>",
        Pattern.CASE_INSENSITIVE);

/** Same shape as {@link #SCRIPT_PATTERN}, but for {@code <style ...>...</style>} elements. */
private static final Pattern STYLE_PATTERN = Pattern.compile(
        "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>",
        Pattern.CASE_INSENSITIVE);

/** Matches any remaining tag: {@code <}, one or more non-{@code >} characters, then {@code >}. */
private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

/**
 * Strips markup from an HTML string and returns the remaining text.
 *
 * <p>Script and style elements are removed together with their contents first, so that
 * their bodies do not leak into the result; every remaining tag is then removed.
 * Character entities (for example {@code &amp;}) are left as-is, and a {@code <} that is
 * never followed by a {@code >} is not treated as a tag.
 *
 * @param htmlStr the HTML to filter; may be {@code null}
 * @return the input with script/style elements and all tags removed, or {@code null}
 *         if {@code htmlStr} is {@code null}
 */
public static String html2Text(String htmlStr){
    // Explicit guard instead of catching the NullPointerException from matcher(null).
    if (htmlStr == null) {
        return null;
    }
    // Order matters: drop script/style elements (including their bodies) before the
    // generic tag pass, otherwise only their tags would be removed and the code/CSS kept.
    String text = SCRIPT_PATTERN.matcher(htmlStr).replaceAll("");
    text = STYLE_PATTERN.matcher(text).replaceAll("");
    return TAG_PATTERN.matcher(text).replaceAll("");
}
Java 过滤 HTML 中的标签获取纯文本

评论

博主热门

01.

02.

03.

04.

05.

06.

07.

08.

举报文章