利用文本挖掘技术来找出网络中的“小鲜词”

发布时间：2021-01-12 10:53:13 所属栏目：大数据来源：网络整理

导读：开始之前，先看一下从人人网中发现的90后用户爱用的词，是不是很好玩，哈哈。写这篇文章就是让你简单地、自动地从文本中找出新的词，这样就知道现在的年轻人喜欢什么了（对于博主这种上了年纪的人来说，真的是很有用，呜呜）。项目结构：当然，text.dat……

开始之前，先看一下从人人网中发现的90后用户爱用的词：

这里写图片描述

是不是很好玩，哈哈。写这篇文章就是让你简单地、自动地从文本中找出新的词，这样就知道现在的年轻人喜欢什么了（对于博主这种上了年纪的人来说，真的是很有用，呜呜）。

项目结构

利用文本挖掘技术来找出网络中的“小鲜词”

当然，text.dat和common.dic这两个文件你可以随意替换，注意text.dat中的数据一定要够份量，否则没啥效果

原理么，看下Matrix67大牛的文章你就懂了

互联网时代的社会语言学：基于SNS的文本数据挖掘

训练数据下载

下边开始上代码

common

这个里边包含以下几个类，主要是定义数据结构

这里写图片描述

CountMap.java

定义一个计数Map来进行数据操作和持久化

package grid.common;

import java.io.Serializable;
import java.util.HashMap;


/**
 * A {@link HashMap} that maps each key to the number of times it has been
 * recorded through {@link #increase(Object)}.
 *
 * @param <T> the key type being counted
 */
public class CountMap<T> extends HashMap<T,Integer> implements Serializable {

    private static final long serialVersionUID = 6097963798841161750L;

    /**
     * Records one more occurrence of {@code t}; a key that has no stored
     * count yet starts at 1.
     *
     * @param t the key to count
     */
    public void increase(T t) {
        Integer previous = get(t);
        put(t, null == previous ? 1 : previous + 1);
    }

    /**
     * Sums the counts of every key currently stored.
     *
     * @return the total of all stored counts
     */
    public int count() {
        int total = 0;
        for (Integer occurrences : values()) {
            total += occurrences;
        }
        return total;
    }

    /**
     * Looks up the count stored for a character key.
     *
     * @param c the character to look up
     * @return the stored count, or 0 when the character has no entry
     */
    public int get(char c) {
        Integer stored = super.get(c);
        if (null == stored) {
            return 0;
        }
        return stored;
    }
}

Node.java

定义语法树的节点

package grid.common;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * A node of a simple tree: it holds a value, a reference to its parent and a
 * lazily created list of children.
 *
 * @param <T> the type of value stored in each node
 */
public class Node<T> {
    /** Child nodes; stays {@code null} until the first call to {@link #add(Object)}. */
    protected List<Node<T>> children;

    /** The node this one was added to, or {@code null} if none was set. */
    protected Node<T> parent;

    /** The value carried by this node. */
    protected T value;

    /**
     * Creates a node with no parent and no children.
     *
     * @param value the value to store
     */
    Node(T value) {
        this.value = value;
    }

    /**
     * Creates a new child holding {@code value} and appends it to this node.
     *
     * @param value the value for the new child
     * @return the newly created child node
     */
    public Node<T> add(T value) {
        if (null == children) {
            children = new ArrayList<Node<T>>();
        }
        Node<T> child = new Node<T>(value);
        child.setParent(this);
        children.add(child);
        return child;
    }

    /** @return the value stored in this node */
    public T getValue() {
        return value;
    }

    /** @return the parent node, or {@code null} if none was set */
    public Node<T> getParent() {
        return parent;
    }

    /** @param parent the node to record as this node's parent */
    public void setParent(Node<T> parent) {
        this.parent = parent;
    }

    /** Collects into {@code list} every node under {@code parent} that has no children list. */
    private void recurseChildren(List<Node<T>> list,Node<T> parent) {
        if (null == parent.children) {
            list.add(parent);
        } else {
            for (Node<T> node : parent.children) {
                recurseChildren(list,node);
            }
        }
    }

    /**
     * Returns the leaves of the subtree rooted at this node. A node without
     * children is its own single leaf.
     *
     * @return a new list of leaf nodes, in depth-first order
     */
    public List<Node<T>> getLeaves() {
        List<Node<T>> list = new ArrayList<Node<T>>();
        recurseChildren(list,this);
        return list;
    }

    /**
     * Returns the values on the path leading down to this node. The walk goes
     * up through the parents and stops at the first ancestor that is a
     * {@code Tree} (whose value is not included) or when there is no parent.
     *
     * @return the values from the topmost included ancestor down to this node
     */
    public List<T> getBranchPath() {
        List<T> list = new ArrayList<T>();
        Node<T> node = this;
        do {
            list.add(node.getValue());
            node = node.parent;
        } while (null != node && !(node instanceof Tree<?>));
        // Values were gathered bottom-up; callers get them top-down.
        Collections.reverse(list);
        return list;
    }

    /** Writes {@code node} and its descendants to {@code builder}, one line per node. */
    private void append(StringBuilder builder,int deep,Node<T> node) {
        for (int i = 0; i < deep; i++) {
            builder.append(" ");
        }
        builder.append("|--");
        builder.append(node.getValue());
        // Line break after every node; the original appended the letter "n"
        // (a lost backslash), which ran the whole tree together on one line.
        builder.append("\n");
        if (null != node.children) {
            for (Node<T> child : node.children) {
                append(builder,deep + 1,child);
            }
        }
    }

    /**
     * Renders the subtree rooted at this node as text, one node per line,
     * indented by one space per level of depth.
     *
     * @return the multi-line textual dump
     */
    public String dump() {
        StringBuilder builder = new StringBuilder();
        append(builder,0,this);
        return builder.toString();
    }

    /**
     * @return the string form of the stored value; {@code "null"} when the
     *         value is {@code null}, matching what {@link #dump()} prints
     */
    @Override
    public String toString() {
        return String.valueOf(value);
    }
}

TextDatReader.java

读取训练数据

package grid.common;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;


/**
 * Loads an entire text file into a single in-memory string.
 */
public class TextDatReader {
    /**
     * Reads the whole file at {@code path} using the platform default charset.
     *
     * <p>The returned string always has as many chars as the file has bytes.
     * When decoding yields fewer chars than bytes (multi-byte encodings), the
     * tail is padded with NUL ({@code '\0'}) chars. This padding is kept
     * because callers in this project scan for a NUL char as an
     * end-of-data marker.
     *
     * @param path the path of the file to read
     * @return the decoded file content, NUL-padded to the file's byte length
     * @throws IOException if the file cannot be opened or read
     */
    public static String read(String path) throws IOException {
        File file = new File(path);
        // Sized by the byte length, which the decoded char count is not
        // expected to exceed.
        char[] buffer = new char[(int) file.length()];
        FileReader reader = new FileReader(file);
        try {
            // A single read() may return before the buffer is full, so keep
            // reading until the buffer is filled or the stream ends.
            int filled = 0;
            while (filled < buffer.length) {
                int n = reader.read(buffer, filled, buffer.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
        } finally {
            // Always release the file handle, even when reading fails.
            reader.close();
        }
        return new String(buffer);
    }
}

TextUtils.java

用来做文本处理，如判断是否为空、匹配字符等

package grid.common;


/**
 * Static helpers for classifying single characters and for simple string
 * checks.
 */
public class TextUtils {

    /**
     * @param c the character to test
     * @return {@code true} when {@code c} lies in U+4E00..U+9FCB inclusive
     *         (within the CJK Unified Ideographs block)
     */
    public static boolean isCnLetter(char c) {
        return !(c < 0x4E00 || c > 0x9FCB);
    }

    /**
     * @param c the character to test
     * @return {@code true} when {@code c} is an ASCII digit '0'..'9'
     */
    public static boolean isNumeric(char c) {
        return !(c < '0' || c > '9');
    }

    /**
     * @param c the character to test
     * @return {@code true} when {@code c} is an ASCII letter a-z or A-Z
     */
    public static boolean isEnLetter(char c) {
        boolean lower = c >= 'a' && c <= 'z';
        boolean upper = c >= 'A' && c <= 'Z';
        return lower || upper;
    }

    /**
     * Tests whether {@code dest} occurs in {@code src} starting at index
     * {@code off}.
     *
     * @param src  the text to look in
     * @param off  the index in {@code src} where the comparison starts
     * @param dest the text expected at that position
     * @return {@code true} when every char of {@code dest} matches; an empty
     *         {@code dest} always matches
     */
    public static boolean match(String src,int off,String dest) {
        final int destLen = dest.length();
        final int srcLen = src.length();
        int i = 0;
        while (i < destLen) {
            int pos = off + i;
            // Running past the end of src, or a differing char, is a mismatch.
            if (pos >= srcLen || dest.charAt(i) != src.charAt(pos)) {
                return false;
            }
            i++;
        }
        return true;
    }

    /**
     * @param str the string to test, may be {@code null}
     * @return {@code true} when {@code str} is {@code null}, empty, or empty
     *         after {@link String#trim()}
     */
    public static boolean isBlank(String str) {
        if (null == str) {
            return true;
        }
        // An empty string is still empty after trimming, so one check covers both.
        return str.trim().isEmpty();
    }
}

Tree.java

语法树

package grid.common;


/**
 * A {@link Node} subtype used as a distinct type for a tree's top node.
 * It adds no state or behavior of its own.
 *
 * @param <T> the type of value stored in each node
 */
public class Tree<T> extends Node<T> {

    /**
     * Creates a tree node holding {@code value}.
     *
     * @param value the value to store in this node
     */
    public Tree(T value) {
        super(value);
    }

}

dic

里边包含CnDictionary类

这里写图片描述

CnDictionary.java

词典处理

package grid.text.dic;

import grid.common.CountMap;
import grid.common.TextDatReader;
import grid.common.TextUtils;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;


/**
 * Holds a set of known words loaded from {@code common.dic} and per-character
 * frequency statistics gathered from {@code text.dat}. Obtain the shared
 * instance through {@link #Instance()}.
 */
public class CnDictionary {

    /** Path of the word list, one word per line. */
    private static final String COMMON_WORD_DIC_PATH = "common.dic";

    /**
     * This text data is for character statistic. Change to your own if you
     * like.
     */
    private static final String COMMON_LETTER_RESOURCE_PATH = "text.dat";

    /** Words loaded from the word list. */
    private Set<String> dictionary = new HashSet<String>();

    /** Occurrence count of each Chinese character found in the text data. */
    private CountMap<Character> letterCountMap = new CountMap<Character>();

    /** Sum of all counts in {@link #letterCountMap}. */
    private int totalLetterCount;

    private static CnDictionary instance;

    /**
     * Returns the shared instance, creating it on first use (singleton).
     * No synchronization is performed.
     *
     * @return the shared dictionary, or {@code null} when loading the data
     *         files failed; the failure is printed and the next call retries
     */
    public static CnDictionary Instance() {
        if (null == instance) {
            try {
                instance = new CnDictionary();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return instance;
    }

    /**
     * Loads the word list and then the character statistics.
     *
     * @throws IOException if either data file cannot be read
     */
    private CnDictionary() throws IOException {
        initWordDic();
        initLetterCountMap();
    }

    /**
     * Counts every Chinese character in the text data (text.dat) and records
     * the overall total.
     *
     * @throws IOException if the text data cannot be read
     */
    private void initLetterCountMap() throws IOException {
        String letterResource = TextDatReader.read(COMMON_LETTER_RESOURCE_PATH);
        final int len = letterResource.length();
        for (int i = 0; i < len; i++) {
            char c = letterResource.charAt(i);
            if (TextUtils.isCnLetter(c)) {
                letterCountMap.increase(c);
            }
        }
        totalLetterCount = letterCountMap.count();
    }

    /**
     * Loads the word list (common.dic), treating each line as one word.
     * Blank lines are skipped and surrounding whitespace is trimmed.
     *
     * @throws IOException if the word list cannot be read
     */
    private void initWordDic() throws IOException {
        String content = TextDatReader.read(COMMON_WORD_DIC_PATH);
        final int len = content.length();
        StringBuilder word = new StringBuilder();
        for (int i = 0; i < len; i++) {
            char c = content.charAt(i);
            if (0 == c) {
                // A NUL char marks the end of real data in the reader's output.
                break;
            }
            // Words are separated by line breaks. The original compared
            // against the letters 'n' and 'r' (lost backslashes), so line
            // breaks never split the words.
            if ('\n' == c || '\r' == c) {
                addWord(word);
            } else {
                word.append(c);
            }
        }
        // Flush the last word when the data does not end with a line break.
        addWord(word);
    }

    /** Adds the pending word to the dictionary unless it is blank, then clears the buffer. */
    private void addWord(StringBuilder word) {
        String s = word.toString();
        if (!TextUtils.isBlank(s)) {
            dictionary.add(s.trim());
        }
        word.setLength(0);
    }

    /**
     * @param word the word to look up
     * @return {@code true} when the word list contains {@code word}
     */
    public boolean contains(String word) {
        return dictionary.contains(word);
    }

    /**
     * @param c the character to look up
     * @return the count of {@code c} divided by the total count of all
     *         counted characters
     */
    public double rate(char c) {
        return (double) letterCountMap.get(c) / totalLetterCount;
    }

    /** @return the number of words in the dictionary */
    public int size() {
        return dictionary.size();
    }
}

evolution

（编辑：西安站长网）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时与联系站长删除相关内容!

1/3

尾页