利用文本挖掘技术来找出网络中的“小鲜词”

发布时间：2021-01-12 10:53:13 所属栏目：大数据来源：网络整理

导读：开始之前，先看一下从人人网中发现的90后用户爱用的词，是不是很好玩，哈哈。写这篇文章，就是想让你简单、自动地从文本中找出新词，这样就知道现在的年轻人喜欢什么了（对于博主这种上了年纪的人来说，真的是很有用，呜呜）。项目结构：当然，text.da……（导读在此处被截断）

这里写图片描述

EntropyJudger.java

计算熵值

package grid.text.evolution;

import grid.common.CountMap;
import grid.common.TextUtils;
import grid.text.index.Pos;
import grid.text.index.TextIndexer;


/**
 * Decides whether a candidate string should be accepted as a word, based on
 * two statistics computed from a {@link TextIndexer}: the candidate's
 * "solid rate" (how often it occurs relative to its individual letters) and
 * the entropy of the letters found directly before and after it.
 */
public class EntropyJudger {

    /** Minimum number of occurrences a candidate needs to get a non-zero solid rate. */
    private static final int LEAST_COUNT_THRESHOLD = 5;

    /**
     * Lower bound for the solid rate returned by {@link #getSolidRate(String)}.
     *
     * The smaller this value is, the more new words you will get, but with
     * less accuracy. The greater this value is, the fewer new words you will
     * get, but with higher accuracy.
     */
    private static final double SOLID_RATE_THRESHOLD = 0.018;

    /**
     * Lower bound for the neighbour entropy computed from the letters that
     * precede and follow the candidate. A candidate whose entropy is below
     * this value is rejected, so raising it makes {@link #judge(String)}
     * stricter and lowering it makes it more permissive.
     */
    private static final double ENTROPY_THRESHOLD = 1.92;

    private TextIndexer indexer;

    /**
     * @param indexer index over the document the candidates are taken from
     */
    public EntropyJudger(TextIndexer indexer) {
        this.indexer = indexer;
    }

    /**
     * Checks the candidate against both thresholds. The entropy is only
     * computed when the solid rate check has already passed.
     *
     * @param candidate text to evaluate
     * @return {@code false} if the solid rate or the entropy is below its
     *         threshold, {@code true} otherwise
     */
    public boolean judge(String candidate) {
        final double solidRate = getSolidRate(candidate);
        if (solidRate < SOLID_RATE_THRESHOLD) {
            return false;
        }

        final double entropy = getEntropy(candidate);
        if (entropy < ENTROPY_THRESHOLD) {
            return false;
        }
        return true;
    }

    /**
     * Walks over every occurrence of the candidate reported by the indexer,
     * counts the letter directly before and directly after each occurrence
     * (only letters accepted by {@code TextUtils.isCnLetter}), and returns
     * the smaller of the two resulting entropies.
     */
    private double getEntropy(String candidate) {
        final Pos pos = new Pos(candidate);
        final CountMap<Character> frontCountMap = new CountMap<Character>();
        final CountMap<Character> backCountMap = new CountMap<Character>();
        final int candidateLen = candidate.length();

        while (indexer.find(pos).isFound()) {
            final int off = pos.getPos();

            final char before = indexer.charAt(off - 1);
            if (TextUtils.isCnLetter(before)) {
                frontCountMap.increase(before);
            }

            final char after = indexer.charAt(off + candidateLen);
            if (TextUtils.isCnLetter(after)) {
                backCountMap.increase(after);
            }
        }

        return Math.min(entropyOf(frontCountMap), entropyOf(backCountMap));
    }

    /**
     * Computes {@code -sum(p * ln p)} over the keys of the map, where
     * {@code p} is the key's count divided by {@code countMap.count()}.
     * An empty map yields 0.
     */
    private static double entropyOf(CountMap<Character> countMap) {
        double entropy = 0;
        for (char key : countMap.keySet()) {
            final double rate = (double) countMap.get(key) / countMap.count();
            entropy -= rate * Math.log(rate);
        }
        return entropy;
    }

    /**
     * Computes how strongly the letters of the candidate stick together:
     * the geometric mean of {@code count(candidate) / count(letter)} over
     * all letters, multiplied by {@code sqrt(length)}.
     *
     * @param candidate text to evaluate
     * @return 1 for candidates shorter than two characters; 0 when the
     *         candidate occurs fewer than {@code LEAST_COUNT_THRESHOLD} times
     *         or when the indexer reports no occurrence for one of its
     *         letters; the solid rate otherwise
     */
    public double getSolidRate(String candidate) {
        final int candidateLen = candidate.length();
        if (candidateLen < 2) {
            return 1;
        }

        final int count = indexer.count(candidate);
        if (count < LEAST_COUNT_THRESHOLD) {
            return 0;
        }

        double rate = 1;
        for (int i = 0; i < candidateLen; i++) {
            final int letterCount = indexer.count(String.valueOf(candidate.charAt(i)));
            if (letterCount <= 0) {
                // Without this guard the division yields Infinity and the
                // candidate would pass the threshold regardless of the data.
                return 0;
            }
            rate *= (double) count / letterCount;
        }

        return Math.pow(rate, 1D / candidateLen) * Math.sqrt(candidateLen);
    }

    /**
     * @param indexer index to use for all subsequent calculations
     */
    public void setIndexer(TextIndexer indexer) {
        this.indexer = indexer;
    }

}

NewWordDiscover.java

抽词程序

package grid.text.evolution;

import grid.common.TextUtils;
import grid.text.dic.CnDictionary;
import grid.text.index.CnPreviewTextIndexer;
import grid.text.index.TextIndexer;
import grid.text.selector.CnTextSelector;
import grid.text.selector.TextSelector;

import java.util.HashSet;
import java.util.Set;


/**
 * Extracts previously unknown words from a document by feeding every
 * candidate produced by a {@link CnTextSelector} through the dictionary and
 * an {@link EntropyJudger}.
 */
public class NewWordDiscover {

    /** Minimum word length */
    private final static int MIN_CANDIDATE_LEN = 2;

    /** Maximum word length */
    private final static int MAX_CANDIDATE_LEN = 6;

    /** Letters that disqualify a candidate when they are its first or last character. */
    private static char[] structuralLetters = { '我','你','您','他','她','谁','哪','那','这','的','了','着','也','是','有','不','在','与','呢','啊','呀','吧','嗯','哦','哈','呐' };

    /** Set view of {@link #structuralLetters}, filled once in the static initializer. */
    private static Set<Character> structuralLetterSet = new HashSet<Character>();

    static {
        for (int i = 0; i < structuralLetters.length; i++) {
            structuralLetterSet.add(structuralLetters[i]);
        }
    }

    private CnDictionary dictionary;

    /** Creates a discoverer backed by the instance returned from {@code CnDictionary.Instance()}. */
    public NewWordDiscover() {
        this.dictionary = CnDictionary.Instance();
    }

    /**
     * New word discovery is based on statistics and entropy, so the document
     * should be at least around the 100kb level; smaller inputs may give
     * unsatisfying results.
     *
     * @param document text to scan
     * @return the candidates accepted by the judger that were not already in
     *         the dictionary
     */
    public Set<String> discover(String document) {
        final TextIndexer indexer = new CnPreviewTextIndexer(document);
        final TextSelector selector = new CnTextSelector(document,
                MIN_CANDIDATE_LEN, MAX_CANDIDATE_LEN);
        final EntropyJudger judger = new EntropyJudger(indexer);
        final Set<String> discovered = new HashSet<String>();

        while (!selector.end()) {
            final String candidate = selector.next();
            if (TextUtils.isBlank(candidate) || hasStructuralEdge(candidate)) {
                continue;
            }

            // To find new words without any dictionary, reduce this check to
            // "discovered.contains(candidate)".
            final boolean alreadyKnown = dictionary.contains(candidate)
                    || discovered.contains(candidate);
            if (alreadyKnown) {
                selector.select();
            } else if (judger.judge(candidate)) {
                discovered.add(candidate);
            }
        }
        return discovered;
    }

    /** Tells whether the first or the last character of the candidate is a structural letter. */
    private static boolean hasStructuralEdge(String candidate) {
        final char first = candidate.charAt(0);
        if (structuralLetterSet.contains(first)) {
            return true;
        }
        final char last = candidate.charAt(candidate.length() - 1);
        return structuralLetterSet.contains(last);
    }
}

index

这里写图片描述

这几个类用于给文本创建索引，方便统计候选词的出现次数，并定位候选词在文本中的位置。

CnPreviewTextIndexer.java

package grid.text.index;

import grid.common.TextUtils;

import java.util.HashMap;
import java.util.Map;
import java.util.Vector;

/**
 * {@link TextIndexer} that builds, up front, a map from every letter accepted
 * by {@code TextUtils.isCnLetter} to the list of offsets at which that letter
 * occurs in the document. Lookups then only inspect the offsets of the
 * target's first letter instead of scanning the whole document.
 */
public class CnPreviewTextIndexer implements TextIndexer {

    /** Initial capacity of the position map; also used to size the per-letter vectors. */
    private final static int CN_LETTER_COUNT = 5021;

    private final String document;

    /** Letter to ascending document offsets of that letter. */
    private Map<Character,Vector<Integer>> posMap;

    /**
     * @param document text to index; the index is built immediately
     */
    public CnPreviewTextIndexer(String document) {
        this.document = document;
        init();
    }

    /** Scans the document once and records the offset of every indexed letter. */
    private void init() {
        final int len = document.length();

        // Capacity hint only; the integer division is kept from the original
        // sizing heuristic and has no effect on correctness.
        final int supposedMinCount = 1 + (int) Math.log(len / CN_LETTER_COUNT
                + 1);

        posMap = new HashMap<Character,Vector<Integer>>(CN_LETTER_COUNT);

        for (int i = 0; i < len; i++) {
            final char c = document.charAt(i);
            if (!TextUtils.isCnLetter(c)) {
                continue;
            }
            Vector<Integer> posVector = posMap.get(c);
            if (null == posVector) {
                posVector = new Vector<Integer>(supposedMinCount);
                posMap.put(c,posVector);
            }
            posVector.add(i);
        }
    }

    /**
     * Counts the indexed offsets of the text's first letter at which
     * {@code TextUtils.match} reports a match for the whole text.
     *
     * @param text text to count
     * @return 0 for blank text or when the first letter is not indexed;
     *         the number of matching offsets otherwise
     */
    @Override
    public int count(String text) {
        if (TextUtils.isBlank(text)) {
            return 0;
        }

        final Vector<Integer> vector = posMap.get(text.charAt(0));
        if (null == vector) {
            return 0;
        }

        if (1 == text.length()) {
            // Every recorded offset is an occurrence of a single letter.
            return vector.size();
        }

        int count = 0;
        for (int i = 0, size = vector.size(); i < size; i++) {
            if (TextUtils.match(document,vector.get(i),text)) {
                count++;
            }
        }
        return count;
    }

    /**
     * Advances the cursor to the next offset, after the one it currently
     * points at, where the cursor's target matches.
     *
     * @param pos cursor holding the target and the last matched position;
     *            it is modified in place
     * @return the same cursor, with {@code isFound()} telling whether another
     *         occurrence was located
     */
    @Override
    public Pos find(Pos pos) {
        final String text = pos.getTarget();

        pos.setFound(false);

        if (TextUtils.isBlank(text)) {
            return pos;
        }

        final Vector<Integer> vector = posMap.get(text.charAt(0));
        if (null == vector) {
            return pos;
        }

        final int arraySize = vector.size();
        for (int i = pos.arrayIndex + 1; i < arraySize; i++) {
            final int offset = vector.get(i);
            // Test the candidate offset itself, exactly as count() does.
            if (TextUtils.match(document,offset,text)) {
                pos.setFound(true);
                pos.setPos(offset);
                pos.arrayIndex = i;
                break;
            }
        }

        return pos;
    }

    /**
     * @return original document length
     */
    @Override
    public int len() {
        return document.length();
    }

    /**
     * @param off start offset
     * @param len number of characters
     * @return the substring of length {@code len} starting at {@code off},
     *         or the empty string when the requested range does not lie
     *         completely inside the document
     */
    @Override
    public String sub(int off,int len) {
        // A range ending exactly at the end of the document is valid; the
        // comparison is written to avoid overflow in "off + len".
        if (off < 0 || len < 0 || len > document.length() - off) {
            return "";
        }
        return document.substring(off,off + len);
    }

    /**
     * @param index document offset
     * @return the character at {@code index}, or the NUL character when the
     *         index is outside the document
     */
    @Override
    public char charAt(int index) {
        if (index < 0 || index >= document.length()) {
            return 0;
        }
        return document.charAt(index);
    }
}

Pos.java

package grid.text.index;


/**
 * Mutable search cursor: carries the text being searched for together with
 * the state of the most recent match.
 */
public class Pos {

    /** Text this cursor searches for. */
    private final String target;

    /** Pos for current matched full target text; -1 until a position is set. */
    private int pos;

    /** Index in position array for current matched full target text; starts at -1. */
    int arrayIndex;

    /** Whether the most recent search reported a match. */
    private boolean found;

    /**
     * Creates a cursor that has not matched anything yet.
     *
     * @param target text to search for
     */
    public Pos(String target) {
        this.target = target;
        this.pos = -1;
        this.arrayIndex = -1;
        this.found = false;
    }

    /** @return the text this cursor searches for */
    public String getTarget() {
        return this.target;
    }

    /** @return the stored match position, or -1 if none has been set */
    public int getPos() {
        return this.pos;
    }

    /** @return the stored found flag */
    public boolean isFound() {
        return this.found;
    }

    /** Stores the position of the current match. */
    void setPos(int pos) {
        this.pos = pos;
    }

    /** Stores whether the latest search matched. */
    void setFound(boolean found) {
        this.found = found;
    }
}

SimpleTextIndexer.java

package grid.text.index;


/**
 * {@link TextIndexer} without any precomputed index: every query is answered
 * by scanning the document with {@link String#indexOf(String, int)}.
 */
public class SimpleTextIndexer implements TextIndexer {

    private final String document;

    /**
     * @param document text to search in
     */
    public SimpleTextIndexer(String document) {
        this.document = document;
    }

    /**
     * Counts non-overlapping occurrences of the text, scanning left to right.
     *
     * @param text text to count
     * @return number of occurrences; 0 for {@code null} or empty text
     */
    @Override
    public int count(String text) {
        // indexOf("") always succeeds and the offset would never advance,
        // so an empty text must be rejected to avoid an endless loop.
        if (text == null || text.isEmpty()) {
            return 0;
        }

        final int len = text.length();
        int count = 0;
        int off = document.indexOf(text, 0);
        while (off > -1) {
            count++;
            off = document.indexOf(text, off + len);
        }
        return count;
    }

    /**
     * Moves the cursor to the next occurrence of its target. The search
     * starts at the beginning of the document for a fresh cursor, otherwise
     * directly behind the previous match.
     *
     * @param pos cursor holding the target and the last matched position;
     *            it is modified in place
     * @return the same cursor, with {@code isFound()} telling whether another
     *         occurrence was located
     */
    @Override
    public Pos find(Pos pos) {
        final String text = pos.getTarget();

        pos.setFound(false);

        // An empty target would be "found" at the same offset forever, which
        // makes callers that loop on isFound() spin; report no match instead.
        if (text == null || text.isEmpty()) {
            return pos;
        }

        final int previous = pos.getPos();
        final int from = previous < 0 ? 0 : previous + text.length();
        final int hit = document.indexOf(text, from);
        if (hit > -1) {
            pos.setFound(true);
            pos.setPos(hit);
        }
        return pos;
    }

    /**
     * @return original document length
     */
    @Override
    public int len() {
        return document.length();
    }

    /**
     * @param off start offset
     * @param len number of characters
     * @return the substring of length {@code len} starting at {@code off},
     *         or the empty string when the requested range does not lie
     *         completely inside the document
     */
    @Override
    public String sub(int off,int len) {
        // Out-of-range requests yield "" (as CnPreviewTextIndexer does)
        // rather than a StringIndexOutOfBoundsException.
        if (off < 0 || len < 0 || len > document.length() - off) {
            return "";
        }
        return document.substring(off,off + len);
    }

    /**
     * @param index document offset
     * @return the character at {@code index}, or the NUL character when the
     *         index is outside the document
     */
    @Override
    public char charAt(int index) {
        if (index < 0 || index >= document.length()) {
            return 0;
        }
        return document.charAt(index);
    }
}

TextIndexer.java

package grid.text.index;


/**
 * Read-only query operations over a single document.
 */
public interface TextIndexer {

    /**
     * @param text text to look for
     * @return count for the specific text
     */
    int count(String text);

    /**
     * @param pos current search position
     * @return next position for the current pos
     */
    Pos find(Pos pos);

    /**
     * @return original document length
     */
    int len();

    /**
     * @param off start offset of the substring
     * @param len length of the substring
     * @return the substring starting at <b>off</b> with a length of
     *         <b>len</b>
     */
    String sub(int off,int len);

    /**
     * @param index offset into the document
     * @return the character at the specified index
     */
    char charAt(int index);
}

participle

（编辑：西安站长网）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时与联系站长删除相关内容!

3/3

首页