自然语言分词处理，词频统计

川长思鸟来 2022-06-10 11:22 269阅读 0赞

目标：对一段文字进行分词处理，并统计分词结果中出现频率最高的五个词。

代码：

ToAnalysis.java

package com.test;
    
    import java.io.Reader;
    import java.util.ArrayList;
    import java.util.List;
    
    import org.ansj.domain.Result;
    import org.ansj.domain.Term;
    import org.ansj.recognition.arrimpl.AsianPersonRecognition;
    import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
    import org.ansj.recognition.arrimpl.NumRecognition;
    import org.ansj.recognition.arrimpl.UserDefineRecognition;
    import org.ansj.splitWord.Analysis;
    import org.ansj.util.AnsjReader;
    import org.ansj.util.Graph;
    import org.ansj.util.NameFix;
    import org.ansj.util.TermUtil.InsertTermType;
    import org.nlpcn.commons.lang.tire.domain.Forest;
    
    /**
     * Standard segmentation.
     *
     * Walks the candidate graph, optionally runs number and person-name
     * recognition, applies the user-defined dictionaries, and returns the
     * surviving terms in array order.
     *
     * @author ansj
     */
    public class ToAnalysis extends Analysis {

        /** Creates an analyzer with no content attached yet. */
        public ToAnalysis() {
            super();
        }

        /**
         * Creates an analyzer that reads its content from the given reader.
         *
         * @param reader source of the text to segment
         */
        public ToAnalysis(Reader reader) {
            super.resetContent(new AnsjReader(reader));
        }

        /**
         * Segments a string with a fresh analyzer.
         *
         * @param str text to segment
         * @return the segmentation result
         */
        public static Result parse(String str) {
            ToAnalysis analysis = new ToAnalysis();
            return analysis.parseStr(str);
        }

        /**
         * Segments a string with a fresh analyzer configured with the given
         * user dictionaries.
         *
         * @param str     text to segment
         * @param forests user-defined dictionaries to apply
         * @return the segmentation result
         */
        public static Result parse(String str, Forest... forests) {
            return new ToAnalysis().setForests(forests).parseStr(str);
        }

        /**
         * Runs the recognition steps over the graph and collects its terms.
         *
         * @param graph candidate graph for the text being segmented
         * @return the non-null terms of the graph, excluding its last slot
         */
        @Override
        protected List<Term> getResult(final Graph graph) {
            return new Merger() {
                @Override
                public List<Term> merger() {
                    graph.walkPath();

                    // Number recognition
                    if (isNumRecognition && graph.hasNum) {
                        new NumRecognition().recognition(graph.terms);
                    }

                    // Person-name recognition
                    if (graph.hasPerson && isNameRecognition) {
                        // Asian person names
                        new AsianPersonRecognition().recognition(graph.terms);
                        graph.walkPathByScore();
                        NameFix.nameAmbiguity(graph.terms);
                        // Foreign person names
                        new ForeignPersonRecognition().recognition(graph.terms);
                        graph.walkPathByScore();
                    }

                    // User-defined dictionary recognition, then prune and re-walk the path
                    new UserDefineRecognition(InsertTermType.SKIP, forests).recognition(graph.terms);
                    graph.rmLittlePath();
                    graph.walkPathByScore();

                    return collectTerms();
                }

                /** Copies every non-null slot of graph.terms except the last one into a list. */
                private List<Term> collectTerms() {
                    List<Term> collected = new ArrayList<Term>();
                    // The final slot is deliberately skipped (presumably an end marker -- confirm in Graph).
                    final int lastSlot = graph.terms.length - 1;
                    int index = 0;
                    while (index < lastSlot) {
                        Term candidate = graph.terms[index];
                        if (candidate != null) {
                            collected.add(candidate);
                        }
                        index++;
                    }
                    setRealName(graph, collected);
                    return collected;
                }
            }.merger();
        }

    }

SameStringCount.java

package com.wordcount;
    
    import java.util.HashMap;
    
    /**
     * Counts how many times each distinct string has been inserted.
     */
    public class SameStringCount {
        /** Occurrence count for every string inserted so far. */
        private final HashMap<String, Integer> map;

        /** Creates an empty counter. */
        public SameStringCount() {
            map = new HashMap<String, Integer>();
        }

        /**
         * Records one more occurrence of the given string.
         *
         * @param string the string to count; first insertion stores 1, later ones increment
         */
        public void hashInsert(String string) {
            // Single lookup: a missing key comes back as null and starts at 1.
            Integer previous = map.get(string);
            map.put(string, previous == null ? 1 : previous + 1);
        }

        /**
         * Returns the live backing map (not a copy) from string to occurrence count.
         *
         * @return the map of counts
         */
        public HashMap<String, Integer> getHashMap() {
            return map;
        }
    }

Result.java

package org.ansj.domain;
    
    import java.util.Iterator;
    import java.util.List;
    
    import org.ansj.recognition.Recognition;
    import org.nlpcn.commons.lang.util.StringUtil;
    
    /**
     * Wrapper around the list of terms produced by a segmentation run.
     *
     * @author Ansj
     */
    public class Result implements Iterable<Term> {

        /** Terms of this result, in segmentation order. */
        private List<Term> terms;

        /**
         * @param terms the terms to wrap
         */
        public Result(List<Term> terms) {
            this.terms = terms;
        }

        /** @return the wrapped term list itself (not a copy) */
        public List<Term> getTerms() {
            return this.terms;
        }

        /** @param terms the term list to wrap from now on */
        public void setTerms(List<Term> terms) {
            this.terms = terms;
        }

        /** @return an iterator over the wrapped terms */
        @Override
        public Iterator<Term> iterator() {
            return this.terms.iterator();
        }

        /** @return the number of terms */
        public int size() {
            return this.terms.size();
        }

        /**
         * @param index position of the wanted term
         * @return the term at that position
         */
        public Term get(int index) {
            return this.terms.get(index);
        }

        /**
         * Runs a recognition engine over this result.
         *
         * @param re the recognition engine to apply
         * @return this result, to allow chaining
         */
        public Result recognition(Recognition re) {
            re.recognition(this);
            return this;
        }

        /** Joins the terms with a comma. */
        @Override
        public String toString() {
            return this.toString(",");
        }

        /**
         * Joins the terms with the given separator.
         *
         * @param split separator placed between terms
         * @return the joined string
         */
        public String toString(String split) {
            return StringUtil.joiner(this.terms, split);
        }

        /**
         * Returns the segmentation result without natures, comma separated.
         *
         * @return the real names of all terms joined by ","
         */
        public String toStringWithOutNature() {
            return this.toStringWithOutNature(",");
        }

        /**
         * Returns the segmentation result without natures.
         *
         * @param split separator placed between real names
         * @return the real names of all terms joined by the separator, or "" when there are no terms
         */
        public String toStringWithOutNature(String split) {
            if (this.terms == null || this.terms.isEmpty()) {
                return "";
            }

            Iterator<Term> cursor = this.terms.iterator();
            // Seed the builder with the first real name so no leading separator is emitted.
            StringBuilder joined = new StringBuilder(cursor.next().getRealName());
            while (cursor.hasNext()) {
                Term following = cursor.next();
                joined.append(split).append(following.getRealName());
            }
            return joined.toString();
        }

    }

Term.java

package org.ansj.domain;
    
    import java.io.Serializable;
    import java.util.List;
    import java.util.Map;
    
    import org.ansj.util.MathUtil;
    import org.nlpcn.commons.lang.util.StringUtil;
    
    /**
     * A single term (token) of a segmentation: its text, offset, natures,
     * path-scoring state and links to neighbouring terms.
     */
    public class Term implements Serializable {

        private static final long serialVersionUID = 1L;
        // Current word
        private String name;
        // Real name; when null, getRealName() falls back to name
        private String realName;
        // Start offset of the current word
        private int offe;
        // Natures (parts of speech) available for this term
        private TermNatures termNatures = TermNatures.NULL;
        // Item this term was constructed from
        private AnsjItem item = AnsjItem.NULL;
        // Next term in the same row
        private Term next;
        // Path score
        private double score = 0;
        // This term's own score
        private double selfScore = 1;
        // Term this one is reached from on the chosen path
        private Term from;
        // Term reached from this one
        private Term to;
        // Nature of this term itself; only meaningful after nature recognition, defaults to Nature.NULL
        private Nature nature = Nature.NULL;
        // Whether this term is a new word
        private boolean newWord;
        // Synonyms
        private List<String> synonyms;

        // Sub-terms of this term, if any
        private List<Term> subTerm = null;

        /**
         * Builds a term from a dictionary item, adopting the item's natures when present.
         *
         * @param name text of the term
         * @param offe start offset of the term
         * @param item item supplying the natures
         */
        public Term(String name, int offe, AnsjItem item) {
            super();
            this.name = name;
            this.offe = offe;
            this.item = item;
            if (item.termNatures != null) {
                this.termNatures = item.termNatures;
                if (termNatures.nature != null) {
                    this.nature = termNatures.nature;
                }
            }
        }

        /**
         * Builds a term with an explicit set of natures.
         *
         * @param name        text of the term
         * @param offe        start offset of the term
         * @param termNatures natures of the term; its single nature is adopted when non-null
         */
        public Term(String name, int offe, TermNatures termNatures) {
            super();
            this.name = name;
            this.offe = offe;
            this.termNatures = termNatures;
            if (termNatures.nature != null) {
                this.nature = termNatures.nature;
            }
        }

        /**
         * Builds a term with a single nature given by name and frequency.
         *
         * @param name       text of the term
         * @param offe       start offset of the term
         * @param natureStr  nature name
         * @param natureFreq frequency associated with that nature
         */
        public Term(String name, int offe, String natureStr, int natureFreq) {
            super();
            this.name = name;
            this.offe = offe;
            TermNature termNature = new TermNature(natureStr, natureFreq);
            this.nature = termNature.nature;
            this.termNatures = new TermNatures(termNature);
        }

        /**
         * Position this term reaches.
         *
         * @return start offset plus the length of the name
         */
        public int toValue() {
            return offe + name.length();
        }

        public int getOffe() {
            return offe;
        }

        public void setOffe(int offe) {
            this.offe = offe;
        }

        public String getName() {
            return name;
        }

        public void setName(String name) {
            this.name = name;
        }

        /**
         * Core step of building the best path (Viterbi).
         *
         * The candidate predecessor is adopted when this term has no predecessor
         * yet, its score is still 0, or its current score is not lower than the
         * candidate's score.
         *
         * @param from        candidate predecessor
         * @param relationMap relation scores passed through to MathUtil.compuScore
         */
        public void setPathScore(Term from, Map<String, Double> relationMap) {
            double score = MathUtil.compuScore(from, this, relationMap);
            if (this.from == null || this.score == 0 || this.score >= score) {
                this.setFromAndScore(from, score);
            }
        }

        /**
         * Best-path step based on the terms' own scores; smaller is better.
         *
         * @param from candidate predecessor
         */
        public void setPathSelfScore(Term from) {
            double score = this.selfScore + from.score;
            // Viterbi: keep the predecessor giving the lowest accumulated score.
            if (this.from == null || this.score > score) {
                this.setFromAndScore(from, score);
            }
        }

        /** Stores the chosen predecessor together with the resulting path score. */
        private void setFromAndScore(Term from, double score) {
            this.from = from;
            this.score = score;
        }

        /**
         * Merges another term into this one.
         *
         * The names are concatenated; the real names are concatenated only when
         * both are non-blank. This term's successor becomes the other term's successor.
         *
         * @param to the term to append
         * @return this term
         */
        public Term merage(Term to) {
            this.name = this.name + to.getName();
            if (StringUtil.isNotBlank(this.realName) && StringUtil.isNotBlank(to.getRealName())) {
                this.realName = this.realName + to.getRealName();
            }
            this.setTo(to.to);
            return this;
        }

        /**
         * Merges another term into this one, blank real names included.
         *
         * @param to the term to append
         * @return this term
         */
        public Term merageWithBlank(Term to) {
            // Resolve the real name BEFORE name changes: getRealName() falls back to
            // name when realName is unset, which avoids concatenating a null field
            // (that would produce the literal text "null").
            String mergedRealName = this.getRealName() + to.getRealName();
            this.name = this.name + to.getName();
            this.realName = mergedRealName;
            this.setTo(to.to);
            return this;
        }

        /**
         * Shifts the offset.
         *
         * @param offe amount added to the current offset
         */
        public void updateOffe(int offe) {
            this.offe += offe;
        }

        public Term next() {
            return next;
        }

        /**
         * Sets the next term.
         *
         * @param next the term to link as next
         * @return this term
         */
        public Term setNext(Term next) {
            this.next = next;
            return this;
        }

        public Term from() {
            return from;
        }

        public Term to() {
            return to;
        }

        public void setFrom(Term from) {
            this.from = from;
        }

        public void setTo(Term to) {
            this.to = to;
        }

        /**
         * Returns all natures of this term.
         *
         * @return the natures
         */
        public TermNatures termNatures() {
            return termNatures;
        }

        public void setNature(Nature nature) {
            this.nature = nature;
        }

        /**
         * Returns the nature of this term; only meaningful after nature computation.
         *
         * @return the nature
         */
        public Nature natrue() {
            return nature;
        }

        public String getNatureStr() {
            return nature.natureStr;
        }

        /**
         * Returns only the real name, without a "/nature" suffix.
         *
         * The suffix was removed on purpose (local modification by lhy) so that
         * joined results contain bare words.
         */
        @Override
        public String toString() {
            return this.getRealName();
        }

        /**
         * Resets both scores of this term to 0.
         */
        public void clearScore() {
            this.score = 0;
            this.selfScore = 0;
        }

        public void setSubTerm(List<Term> subTerm) {
            this.subTerm = subTerm;
        }

        public List<Term> getSubTerm() {
            return subTerm;
        }

        /**
         * @return the real name, or the name when no real name has been set
         */
        public String getRealName() {
            if (realName == null) {
                return name;
            }
            return realName;
        }

        public void setRealName(String realName) {
            this.realName = realName;
        }

        public double score() {
            return this.score;
        }

        public void score(double score) {
            this.score = score;
        }

        public double selfScore() {
            return this.selfScore;
        }

        public void selfScore(double selfScore) {
            this.selfScore = selfScore;
        }

        public AnsjItem item() {
            return this.item;
        }

        public boolean isNewWord() {
            return newWord;
        }

        public void setNewWord(boolean newWord) {
            this.newWord = newWord;
        }

        /**
         * Replaces the natures and adopts their single nature as this term's nature.
         *
         * @param termNatures the new natures
         */
        public void updateTermNaturesAndNature(TermNatures termNatures) {
            this.termNatures = termNatures;
            this.nature = termNatures.nature;
        }

        public List<String> getSynonyms() {
            return synonyms;
        }

        public void setSynonyms(List<String> synonyms) {
            this.synonyms = synonyms;
        }

    }

WordCount.java

package com.wordcount;
    
    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.Comparator;
    import java.util.HashMap;
    import java.util.Iterator;
    import java.util.List;
    import java.util.Map;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    import org.ansj.splitWord.analysis.ToAnalysis;
    import org.ansj.util.MyStaticValue;
    
    /*
     * Purpose: find the five most frequent segmented words and their frequencies.
     * Author: lhy
     * Date: 2017-08-17
     */
    public class WordCount {

        /** Maximum number of words returned. */
        private static final int TOP_N = 5;

        /**
         * Accepts a token made of two or more Chinese characters (U+4E00 to U+9FA5).
         * Compiled once; accepts exactly the same tokens as the earlier nested-quantifier
         * form, without its backtracking.
         */
        private static final Pattern CHINESE_WORD = Pattern.compile("[\u4e00-\u9fa5]{2,}");

        /**
         * Segments the text and returns its most frequent words.
         *
         * Only tokens consisting of two or more Chinese characters are counted.
         * Sets MyStaticValue.isNumRecognition to true and
         * MyStaticValue.isQuantifierRecognition to false before segmenting.
         *
         * @param str text to analyse; null or empty yields an empty map
         * @return at most five entries (word to frequency); iteration order is from most
         *         to least frequent
         */
        public HashMap<String, Integer> getGarticiples(String str) {
            MyStaticValue.isNumRecognition = true;
            MyStaticValue.isQuantifierRecognition = false;

            // Insertion-ordered so the ranking survives in the returned map.
            HashMap<String, Integer> top = new java.util.LinkedHashMap<String, Integer>();
            if (str == null || str.isEmpty()) {
                return top;
            }

            // The segmentation result prints as comma-joined words; punctuation tokens
            // (including stray commas) are dropped by the Chinese-word filter below.
            String[] tokens = ToAnalysis.parse(str).toString().split(",");
            SameStringCount counter = new SameStringCount();
            for (String token : tokens) {
                if (isChineseWord(token)) {
                    counter.hashInsert(token); // count this word
                }
            }

            @SuppressWarnings("unchecked") // SameStringCount stores String keys and Integer counts
            Map<String, Integer> counts = counter.getHashMap();
            List<Map.Entry<String, Integer>> ranked =
                    new ArrayList<Map.Entry<String, Integer>>(counts.entrySet());
            Collections.sort(ranked, new ValueComparator()); // highest frequency first

            int limit = Math.min(TOP_N, ranked.size());
            for (Map.Entry<String, Integer> entry : ranked.subList(0, limit)) {
                top.put(entry.getKey(), entry.getValue());
            }
            return top;
        }

        /** Tells whether the whole token consists of two or more Chinese characters. */
        private static boolean isChineseWord(String token) {
            return CHINESE_WORD.matcher(token).matches();
        }

        /** Orders entries by descending value. */
        private static class ValueComparator implements Comparator<Map.Entry<String, Integer>> {
            @Override
            public int compare(Map.Entry<String, Integer> m, Map.Entry<String, Integer> n) {
                // Integer.compare avoids the overflow that subtraction can cause.
                return Integer.compare(n.getValue(), m.getValue());
            }
        }
    }

WordDegrees.java

package com.wordcount;
    
    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.Comparator;
    import java.util.HashMap;
    import java.util.Iterator;
    import java.util.List;
    import java.util.Map;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import org.ansj.splitWord.analysis.ToAnalysis;
    import org.ansj.util.MyStaticValue;
    
    /*
     * Purpose: find the five most frequent segmented words and return them joined by spaces.
     * Author: lhy
     * Date: 2017-08-17
     */
    public class WordDegrees {

        /** Maximum number of words returned. */
        private static final int TOP_N = 5;

        /**
         * Accepts a token made of two or more Chinese characters (U+4E00 to U+9FA5).
         * Compiled once; accepts exactly the same tokens as the earlier nested-quantifier
         * form, without its backtracking.
         */
        private static final Pattern CHINESE_WORD = Pattern.compile("[\u4e00-\u9fa5]{2,}");

        /**
         * Segments the text and returns its most frequent words.
         *
         * Only tokens consisting of two or more Chinese characters are counted.
         * Sets MyStaticValue.isNumRecognition to true and
         * MyStaticValue.isQuantifierRecognition to false before segmenting.
         *
         * @param str text to analyse; null or empty yields an empty string
         * @return at most five words from most to least frequent, each followed by one
         *         space (so a non-empty result ends with a space)
         */
        public String WordProcessing(String str) {
            MyStaticValue.isNumRecognition = true;
            MyStaticValue.isQuantifierRecognition = false;

            if (str == null || str.isEmpty()) {
                return "";
            }

            // The segmentation result prints as comma-joined words; punctuation tokens
            // (including stray commas) are dropped by the Chinese-word filter below.
            String[] tokens = ToAnalysis.parse(str).toString().split(",");
            SameStringCount counter = new SameStringCount();
            for (String token : tokens) {
                if (isChineseWord(token)) {
                    counter.hashInsert(token); // count this word
                }
            }

            @SuppressWarnings("unchecked") // SameStringCount stores String keys and Integer counts
            Map<String, Integer> counts = counter.getHashMap();
            List<Map.Entry<String, Integer>> ranked =
                    new ArrayList<Map.Entry<String, Integer>>(counts.entrySet());
            Collections.sort(ranked, new ValueComparator()); // highest frequency first

            StringBuilder words = new StringBuilder();
            int limit = Math.min(TOP_N, ranked.size());
            for (Map.Entry<String, Integer> entry : ranked.subList(0, limit)) {
                // Trailing space after every word is part of the established output format.
                words.append(entry.getKey()).append(" ");
            }
            return words.toString();
        }

        /** Tells whether the whole token consists of two or more Chinese characters. */
        private static boolean isChineseWord(String token) {
            return CHINESE_WORD.matcher(token).matches();
        }

        /** Orders entries by descending value. */
        private static class ValueComparator implements Comparator<Map.Entry<String, Integer>> {
            @Override
            public int compare(Map.Entry<String, Integer> m, Map.Entry<String, Integer> n) {
                // Integer.compare avoids the overflow that subtraction can cause.
                return Integer.compare(n.getValue(), m.getValue());
            }
        }
    }

Test.java

package com.wordcount;
    
    import java.util.HashMap;
    
    /**
     * Small driver that runs both word-frequency variants on a sample paragraph
     * and prints what they return.
     */
    public class Test {

        public static void main(String[] args) {
            final String sampleText = "《热爱生命》,可以说是汪国真的代表作之一,这首诗以四个肯定的回答表达出为何要热爱生命的哲理.四个段落,看似相似,却各有其趣.四个段落分别以“成功”、“爱情”、“奋斗历程”和“未来”为意象进行分析和回答.这四个意象可以说是包括汪国真、席慕容在内的一些清新哲理派诗人惯用的几个意象,不晦涩,不故弄玄虚,不生僻难解,可以说是完全区别于朦胧诗的特点,也是汪国真的诗歌取得成功之原因所在.";

            // Approach 1: words together with their frequencies
            HashMap<String, Integer> wordsWithCounts = new WordCount().getGarticiples(sampleText);
            System.out.println("第一种方式：");
            System.out.println(wordsWithCounts);

            // Approach 2: words only, joined by spaces
            System.out.println("第二种方式：");
            System.out.println(new WordDegrees().WordProcessing(sampleText));
        }

    }

截图：

![Center][]

总结：本文主要借鉴了开源分词项目 Ansj（org.ansj），并在其基础上自己添加了词频统计功能。

[Center]: /images/20220610/979163257ee242da9b5c87ef9b1b9242.png