最短路徑分詞程序 – JAVA編程語言程序開發技術文章

最短路徑分詞法
public class SPM2 extends M
{
    public static final HashMap<Character,TreeNode> dic = Dictionary.loadFreqDictionary("sogou.txt");
   
    /**
     * @return 返回可能匹配詞的長度, 沒有找到返回 0.
     */
    public ArrayList<Integer> maxMatch(TreeNode node,char[] sen, int offset)
    {
        ArrayList<Integer> list=new ArrayList<Integer>();
        for(int i=offset; i<sen.length; i++)
        {
            node = node.subNode(sen[i]);
            if(node != null)
            {
                if(node.isAlsoLeaf())
                    list.add(i+1);
            }
            else
                break;
        }
        return list;
    }
   
    @Override
    public ArrayList<Token> getToken(ArrayList<Sentence> list)
    {
        ArrayList<Token> tokenlist=new ArrayList<Token>();
        for(Sentence sen:list)
        {
            AdjList g = new AdjList(sen.getText().length+1);//存儲所有被切分的可能的詞
            int i=0;
            while(i<sen.getText().length)
            {
                Token token = new Token(new String(sen.getText(),i,1),i,i+1);
                token.setWeight(1);
                g.addEdge(token);
               
                TreeNode n=dic.get(sen.getText()[i]);
                if(n!=null)
                {
                    ArrayList<Integer> ilist =maxMatch(n, sen.getText(),i);
                    if(ilist.size()>0)
                        for(int j=0;j<ilist.size();j++)
                        {
                            token = new Token(new String(sen.getText(),i,ilist.get(j)-i),i,ilist.get(j));
                            token.setWeight(1);
                            g.addEdge(token);
                        }
                }
                i++;
            }
            //System.out.println(g);
            ArrayList<Integer> ret=maxProb(g);
            Collections.reverse(ret);
            int first=0;
            for(Integer last:ret)
            {
                Token token = new Token(new String(sen.getText(),first,last-first),sen.getStartOffset()+first,sen.getStartOffset()+last);
                tokenlist.add(token);
                first=last;
            }
        }
        return tokenlist;
    }
   
    int[] prevNode;
    double[] prob;
   
    //計算出路徑最短的數組
    public ArrayList<Integer> maxProb(AdjList g)
    {
        prevNode = new int[g.verticesNum]; //最佳前驅節點
        prob = new double[g.verticesNum]; //節點路徑
        prob[0] = 0;//節點0的初始路徑是0
       
        //按節點求最佳前驅
        for (int index = 1; index < g.verticesNum; index++)
            getBestPrev(g,index);//求出最大前驅
       
        ArrayList<Integer> ret = new ArrayList<Integer>();
        for(int i=(g.verticesNum-1);i>0;i=prevNode[i]) // 從右向左找最佳前驅節點
            ret.add(i);
        return ret;
    }
   
    //計算節點i的最佳前驅節點
    void getBestPrev(AdjList g,int i)
    {
        Iterator<Token> it = g.getPrev(i);//得到前驅詞集合,從中挑選最佳前趨詞
        double maxProb = 1000;
        int maxNode = -1;
       
        while(it.hasNext())
        {
            Token itr = it.next();
            double nodeProb = prob[itr.getStart()]+itr.getWeight();//候選節點路徑
            //System.out.println(itr.getWord()+","+nodeProb);
              if (nodeProb < maxProb)//路徑最短的算作最佳前趨
              {
                  maxNode = itr.getStart();
                  maxProb = nodeProb;
              }
         }
        prob[i] = maxProb;//節點路徑
        prevNode[i] = maxNode;//最佳前驅節點
    }
}

發佈留言

發佈留言必須填寫的電子郵件地址不會公開。 必填欄位標示為 *