2025-02-09

問題描述,使用htmlparser的lexer解析器進行頁面解析時發現類似如下的頁面會有問題:
 
 
Js代碼 
<script> 
for(i=0;i<a;i++){ 
 

</script> 
 
解析後代碼變成了(注意</script>前面多出了一個>): 
<script> 
for(i=0;i<a;i++){ 
 

></script> 
 
 通過閱讀lexer的代碼發現,實際上只要把js代碼改成:
Js代碼 
<script> 
<!-- 
for(i=0;i<a;i++){ 
 

--> 
</script> 
 就不會有問題了。從代碼中可以發現,它的解析本身其實沒有問題,主要是我們平時的頁面規範做得不好:它在逐個字符解析時,如果發現<後面緊跟著字母,就認為這是一個tag的開始:
Java代碼 
protected Node parseString(int start, boolean quotesmart) 
        throws ParserException { 
    boolean done; 
    char ch; 
    char quote; 
 
    done = false; 
    quote = 0; 
    while (!done) { 
        ch = mPage.getCharacter(mCursor); 
        if (Page.EOF == ch) 
            done = true; 
        else if (0x1b == ch) // escape 
        { 
            ch = mPage.getCharacter(mCursor); 
            if (Page.EOF == ch) 
                done = true; 
            else if ('$' == ch) { 
                ch = mPage.getCharacter(mCursor); 
                if (Page.EOF == ch) 
                    done = true; 
                // JIS X 0208-1978 and JIS X 0208-1983 
                else if ('@' == ch || 'B' == ch) 
                    scanJIS(mCursor); 
                /*
                 * // JIS X 0212-1990 else if ('(' == ch) { ch =
                 * mPage.getCharacter (mCursor); if (Page.EOF == ch) done =
                 * true; else if ('D' == ch) scanJIS (mCursor); else {
                 * mPage.ungetCharacter (mCursor); mPage.ungetCharacter
                 * (mCursor); mPage.ungetCharacter (mCursor); } }
                 */ 
                else { 
                    mPage.ungetCharacter(mCursor); 
                    mPage.ungetCharacter(mCursor); 
                } 
            } else 
                mPage.ungetCharacter(mCursor); 
        } else if (quotesmart && (0 == quote) 
                && (('\'' == ch) || ('"' == ch))) 
            quote = ch; // enter quoted state 
        // patch from Gernot Fricke to handle escaped closing quote 
        else if (quotesmart && (0 != quote) && ('\\' == ch)) { 
            ch = mPage.getCharacter(mCursor); // try to consume escape 
            if ((Page.EOF != ch) && ('\\' != ch) // escaped backslash 
                    && (ch != quote)) // escaped quote character 
                // ( reflects ["] or ['] whichever opened the quotation) 
                mPage.ungetCharacter(mCursor); // unconsume char if char not 
                                                // an escape 
        } else if (quotesmart && (ch == quote)) 
            quote = 0; // exit quoted state 
        else if (quotesmart && (0 == quote) && (ch == '/')) { 
            // handle multiline and double slash comments (with a quote) 
            // in script like: 
            // I can't handle single quotations. 
            ch = mPage.getCharacter(mCursor); 
            if (Page.EOF == ch) 
                done = true; 
            else if ('/' == ch) { 
                do 
                    ch = mPage.getCharacter(mCursor); 
                while ((Page.EOF != ch) && ('\n' != ch)); 
            } else if ('*' == ch) { 
                do { 
                    do 
                        ch = mPage.getCharacter(mCursor); 
                    while ((Page.EOF != ch) && ('*' != ch)); 
                    ch = mPage.getCharacter(mCursor); 
                    if (ch == '*') 
                        mPage.ungetCharacter(mCursor); 
                } while ((Page.EOF != ch) && ('/' != ch)); 
            } else 
                mPage.ungetCharacter(mCursor); 
        } else if ((0 == quote) && ('<' == ch)) { 
            ch = mPage.getCharacter(mCursor); 
            if (Page.EOF == ch) 
                done = true; 
            // the order of these tests might be optimized for speed: 
            <strong>else if ('/' == ch 
                    || Character.isLetter(ch) 
                    || '!' == ch || '%' == ch || '?' == ch) {</strong> 
                done = true; 
                mPage.ungetCharacter(mCursor); 
                mPage.ungetCharacter(mCursor); 
            } else { 
                // it's not a tag, so keep going, but check for quotes 
                mPage.ungetCharacter(mCursor); 
            } 
        } 
    } 
 
    return (makeString(start, mCursor.getPosition())); 

 因此為了解決這個問題,現在要在上面做一個手腳:
首先在類中間增加了一個標記,用來記錄當前是否處於script標簽內;
這個標記是通過修改nextNode方法來維護的:在返回前判斷一下剛解析出的標簽是<script>還是</script>;
然後在parseString中修改其解析方法就可以了,下面是完整的代碼:
Java代碼 
import java.net.URLConnection; 
 
import org.htmlparser.Node; 
import org.htmlparser.lexer.Lexer; 
import org.htmlparser.lexer.Page; 
import org.htmlparser.nodes.TagNode; 
import org.htmlparser.util.ParserException; 
import org.slf4j.Logger; 
import org.slf4j.LoggerFactory; 
 
/**
 * @author edwardpro
 * 
 */ 
public class LexerFixed extends Lexer { 
    private static final Logger logger = LoggerFactory 
            .getLogger(LexerFixed.class); 
 
    /**
     * 
     */ 
    private static final long serialVersionUID = 8425806017089419815L; 
 
    //script標簽標記,如果發現當前在script裡就掠過所有的< > 
    private int script=0; 
 
    /**
     * 
     */ 
    public LexerFixed() { 
        super(); 
    } 
 
    /**
     * @param page
     */ 
    public LexerFixed(Page page) { 
        super(page); 
    } 
 
    /**
     * @param text
     */ 
    public LexerFixed(String text) { 
        super(text); 
    } 
 
    /**
     * @param connection
     * @throws ParserException
     */ 
    public LexerFixed(URLConnection connection) throws ParserException { 
        super(connection); 
    } 
 
    @Override 
    public Node nextNode(boolean quotesmart) throws ParserException { 
        Node ret = super.nextNode(quotesmart); 
        checkTag(ret); 
        return (ret); 
    } 
 
    /**
     * checkTag用於修改tagNode的方法當有入參數時都會進行一次參數修正另外對內容進行一下escape操作並且會進行判斷是否存在已經escape的蹟象
     * 
     * @param node
     */ 
    private void checkTag(Node node) { 
        if (node != null && node instanceof TagNode 
                && !((TagNode) node).isEmptyXmlTag()) { 
            String tagName = ((TagNode) node).getTagName(); 
            if("SCRIPT".equalsIgnoreCase(tagName)){ 
                if (!((TagNode) node).isEndTag() ) { 
                    this.script=1; 
                } else{ 
                    this.script=0; 
                } 
            } 
        } 
    } 
 
    @Override 
    protected Node parseString(int start, boolean quotesmart) 
            throws ParserException { 
        boolean done; 
        char ch; 
        char quote; 
 
        done = false; 
        quote = 0; 
        while (!done) { 
            ch = mPage.getCharacter(mCursor); 
            if (Page.EOF == ch) 
                done = true; 
            else if (0x1b == ch) // escape 
            { 
                ch = mPage.getCharacter(mCursor); 
                if (Page.EOF == ch) 
                    done = true; 
                else if ('$' == ch) { 
                    ch = mPage.getCharacter(mCursor); 
                    if (Page.EOF == ch) 
                        done = true; 
                    // JIS X 0208-1978 and JIS X 0208-1983 
                    else if ('@' == ch || 'B' == ch) 
                        scanJIS(mCursor); 
                    /*
                     * // JIS X 0212-1990 else if ('(' == ch) { ch =
                     * mPage.getCharacter (mCursor); if (Page.EOF == ch) done =
                     * true; else if ('D' == ch) scanJIS (mCursor); else {
                     * mPage.ungetCharacter (mCursor); mPage.ungetCharacter
                     * (mCursor); mPage.ungetCharacter (mCursor); } }
                     */ 
                    else { 
                        mPage.ungetCharacter(mCursor); 
                        mPage.ungetCharacter(mCursor); 
                    } 
                } else 
                    mPage.ungetCharacter(mCursor); 
            } else if (quotesmart && (0 == quote) 
                    && (('\'' == ch) || ('"' == ch))) 
                quote = ch; // enter quoted state 
            // patch from Gernot Fricke to handle escaped closing quote 
            else if (quotesmart && (0 != quote) && ('\\' == ch)) { 
                ch = mPage.getCharacter(mCursor); // try to consume escape 
                if ((Page.EOF != ch) && ('\\' != ch) // escaped backslash 
                        && (ch != quote)) // escaped quote character 
                    // ( reflects ["] or ['] whichever opened the quotation) 
                    mPage.ungetCharacter(mCursor); // unconsume char if char not 
                                                    // an escape 
            } else if (quotesmart && (ch == quote)) 
                quote = 0; // exit quoted state 
            else if (quotesmart && (0 == quote) && (ch == '/')) { 
                // handle multiline and double slash comments (with a quote) 
                // in script like: 
                // I can't handle single quotations. 
                ch = mPage.getCharacter(mCursor); 
                if (Page.EOF == ch) 
                    done = true; 
                else if ('/' == ch) { 
                    do 
                        ch = mPage.getCharacter(mCursor); 
                    while ((Page.EOF != ch) && ('\n' != ch)); 
                } else if ('*' == ch) { 
                    do { 
                        do 
                            ch = mPage.getCharacter(mCursor); 
                        while ((Page.EOF != ch) && ('*' != ch)); 
                        ch = mPage.getCharacter(mCursor); 
                        if (ch == '*') 
                            mPage.ungetCharacter(mCursor); 
                    } while ((Page.EOF != ch) && ('/' != ch)); 
                } else 
                    mPage.ungetCharacter(mCursor); 
            } else if ((0 == quote) && ('<' == ch)) { 
                ch = mPage.getCharacter(mCursor); 
                if (Page.EOF == ch) 
                    done = true; 
                // the order of these tests might be optimized for speed: 
                else if ('/' == ch 
                        || (Character.isLetter(ch) && this.script==0) 
                        || '!' == ch || '%' == ch || '?' == ch) { 
                    done = true; 
                    mPage.ungetCharacter(mCursor); 
                    mPage.ungetCharacter(mCursor); 
                } else { 
                    // it's not a tag, so keep going, but check for quotes 
                    mPage.ungetCharacter(mCursor); 
                } 
            } 
        } 
 
        return (makeString(start, mCursor.getPosition())); 
    } 

作者“edwardpro”
 

發佈留言

發佈留言必須填寫的電子郵件地址不會公開。 必填欄位標示為 *