利用文本挖掘技术来找出网络中的“小鲜词”
发布时间:2021-01-12 10:53:13 所属栏目:大数据 来源:网络整理
导读:开始之前,先看一下从人人网中发现的90后用户爱用的词。是不是很好玩,哈哈。写这篇文章就是为了让你简单、自动地从文本中找出新词,这样就知道现在的年轻人喜欢什么了(对于博主这种上了年纪的人来说,真的很有用,呜呜)。项目结构:当然,text.da……
EntropyJudger.java计算熵值 package grid.text.evolution; import grid.common.CountMap; import grid.common.TextUtils; import grid.text.index.Pos; import grid.text.index.TextIndexer; public class EntropyJudger { private TextIndexer indexer; /** * A word least appeared count */ private static int LEAST_COUNT_THRESHOLD = 5; //阈值 /** * Threshold for solid rate calculated by word appeared count and every * single letter. * * The smaller this values is,more new words you will get,but with less * accuracy. The greater this value is,less new words you will get,but * with high accuracy. */ private static double SOLID_RATE_THRESHOLD = 0.018; /** * Threshold for entropy value calculated by candidate word prefix character * count and suffix character count * * The smaller this values is,but * with high accuracy. */ private static double ENTROPY_THRESHOL = 1.92; public EntropyJudger(TextIndexer indexer) { this.indexer = indexer; } public boolean judge(String candidate) { double solidRate = getSolidRate(candidate); if (solidRate < SOLID_RATE_THRESHOLD) { return false; } double entropy = getEntropy(candidate); if (entropy < ENTROPY_THRESHOL) { return false; } return true; } private double getEntropy(String candidate) { Pos pos = new Pos(candidate); CountMap<Character> frontCountMap = new CountMap<Character>(); CountMap<Character> backCountMap = new CountMap<Character>(); final int candidateLen = candidate.length(); int off = 0; char c; double rate,frontEntropy = 0,backEntropy = 0; while (indexer.find(pos).isFound()) { off = pos.getPos(); c = indexer.charAt(off - 1); if (TextUtils.isCnLetter(c)) { frontCountMap.increase(c); } c = indexer.charAt(off + candidateLen); if (TextUtils.isCnLetter(c)) { backCountMap.increase(c); } } for (char key : frontCountMap.keySet()) { rate = (double) frontCountMap.get(key) / frontCountMap.count(); frontEntropy -= rate * Math.log(rate); } for (char key : backCountMap.keySet()) { rate = (double) backCountMap.get(key) / backCountMap.count(); backEntropy -= 
rate * Math.log(rate); } return frontEntropy > backEntropy ? backEntropy : frontEntropy; } /** * @param candidate * @return */ public double getSolidRate(String candidate) { final int candidateLen = candidate.length(); if (candidateLen < 2) { return 1; } final int count = indexer.count(candidate); double rate = 1; if (count < LEAST_COUNT_THRESHOLD) { return 0; } for (int i = 0; i < candidateLen; i++) { rate *= (double) count / indexer.count("" + candidate.charAt(i)); } return Math.pow(rate,1D / candidateLen) * Math.sqrt(candidateLen); } public void setIndexer(TextIndexer indexer) { this.indexer = indexer; } } NewWordDiscover.java抽词程序 package grid.text.evolution; import grid.common.TextUtils; import grid.text.dic.CnDictionary; import grid.text.index.CnPreviewTextIndexer; import grid.text.index.TextIndexer; import grid.text.selector.CnTextSelector; import grid.text.selector.TextSelector; import java.util.HashSet; import java.util.Set; public class NewWordDiscover { private CnDictionary dictionary; /** * Minimum word length */ private final static int MIN_CANDIDATE_LEN = 2; /** * Maximum word length */ private final static int MAX_CANDIDATE_LEN = 6; private static Set<Character> structuralLetterSet = new HashSet<Character>(); private static char[] structuralLetters = { '我','你','您','他','她','谁','哪','那','这','的','了','着','也','是','有','不','在','与','呢','啊','呀','吧','嗯','哦','哈','呐' }; static { for (char c : structuralLetters) { structuralLetterSet.add(c); } } public NewWordDiscover() { dictionary = CnDictionary.Instance(); } /** * New word discover is based on statistic and entropy,better to sure * document size is in 100kb level,or you may get a unsatisfied result. 
* * @param document * @return */ public Set<String> discover(String document) { Set<String> set = new HashSet<String>(); TextIndexer indexer = new CnPreviewTextIndexer(document); TextSelector selector = new CnTextSelector(document,MIN_CANDIDATE_LEN,MAX_CANDIDATE_LEN); EntropyJudger judger = new EntropyJudger(indexer); String candidate; while (!selector.end()) { candidate = selector.next(); if (TextUtils.isBlank(candidate)) { continue; } if (structuralLetterSet.contains(candidate.charAt(0)) || structuralLetterSet.contains(candidate.charAt(candidate .length() - 1))) { continue; } // Replace IF clause with "set.contains(candidate)" if you want to // find new word without any dictionary if (dictionary.contains(candidate) || set.contains(candidate)) { selector.select(); } else if (judger.judge(candidate)) { set.add(candidate); } } return set; } } index这几个类用于给词创建索引,方便从词典中找出 CnPreviewTextIndexer.javapackage grid.text.index; import grid.common.TextUtils; import java.util.HashMap; import java.util.Map; import java.util.Vector; public class CnPreviewTextIndexer implements TextIndexer { private final static int CN_LETTER_COUNT = 5021; private String document; private Map<Character,Vector<Integer>> posMap; public CnPreviewTextIndexer(String document) { this.document = document; init(); } private void init() { final int len = document.length(); final int supposedMinCount = 1 + (int) Math.log(len / CN_LETTER_COUNT + 1); char c; Vector<Integer> posVector; posMap = new HashMap<Character,Vector<Integer>>(CN_LETTER_COUNT); for (int i = 0; i < len; i++) { c = document.charAt(i); if (!TextUtils.isCnLetter(c)) { continue; } posVector = posMap.get(c); if (null == posVector) { posVector = new Vector<Integer>(supposedMinCount); posMap.put(c,posVector); } posVector.add(i); } } @Override public int count(String text) { if (TextUtils.isBlank(text)) { return 0; } Vector<Integer> vector = posMap.get(text.charAt(0)); if (null == vector) { return 0; } if (1 == text.length()) { return 
vector.size(); } final int size = vector.size(); int count = 0; for (int i = 0; i < size; i++) { if (TextUtils.match(document,vector.get(i),text)) { count++; } } return count; } @Override public Pos find(Pos pos) { String text = pos.getTarget(); pos.setFound(false); if (TextUtils.isBlank(text)) { return pos; } Vector<Integer> vector = posMap.get(text.charAt(0)); if (null == vector) { return pos; } final int arraySize = vector.size(); final int arrayIndex = pos.arrayIndex + 1; for (int i = arrayIndex; i < arraySize; i++) { if (TextUtils.match(document,text)) { pos.setFound(true); pos.setPos(vector.get(i)); pos.arrayIndex = i; break; } } return pos; } @Override public int len() { return document.length(); } @Override public String sub(int off,int len) { if (off < 0 || off + len >= document.length()) { return ""; } return document.substring(off,off + len); } @Override public char charAt(int index) { if (index < 0 || index >= document.length()) { return 0; } return document.charAt(index); } } Pos.javapackage grid.text.index; public class Pos { private String target; /** * Pos for current matched full target text */ private int pos = -1; /** * Index in position array for current matched full target text */ int arrayIndex = -1; private boolean found = false; public Pos(String target) { this.target = target; } public String getTarget() { return target; } public int getPos() { return pos; } public boolean isFound() { return found; } void setPos(int pos) { this.pos = pos; } void setFound(boolean found) { this.found = found; } } SimpleTextIndexer.javapackage grid.text.index; public class SimpleTextIndexer implements TextIndexer { private String document; public SimpleTextIndexer(String document) { this.document = document; } @Override public int count(String text) { int off = 0; int count = 0; final int len = text.length(); while ((off = document.indexOf(text,off)) > -1) { count++; off += len; } return count; } @Override public Pos find(Pos pos) { final String text = 
pos.getTarget(); final int len = text.length(); int off = pos.getPos() + len; if (pos.getPos() < 0) off = 0; pos.setFound(false); if ((off = document.indexOf(text,off)) > -1) { pos.setFound(true); pos.setPos(off); } return pos; } @Override public int len() { return document.length(); } @Override public String sub(int off,int len) { return document.substring(off,off + len); } @Override public char charAt(int index) { if (index < 0 || index >= document.length()) { return 0; } return document.charAt(index); } } TextIndexer.javapackage grid.text.index; public interface TextIndexer { /** * @param text * @return count for specific text */ public int count(String text); /** * @param pos * @return next position for current pos */ public Pos find(Pos pos); /** * @return original document length */ public int len(); /** * @param off * @param len * @return the sub string start from <b>off</b> and with a length with * <b>len</b> */ public String sub(int off,int len); /** * @param index * @return return the character in the specified index */ public char charAt(int index); } participle(编辑:西安站长网) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |