利用文本挖掘技术来找出网络中的“小鲜词”
发布时间:2021-01-12 10:53:13 所属栏目:大数据 来源:网络整理
导读:开始之前,先看一下从人人网中发现的90后用户爱用的词。是不是很好玩,哈哈。写这篇文章就是让你简单地、自动地从文本中找出新的词,这样就知道现在的年轻人喜欢什么了(对于博主这种上了年纪的人来说,真的是很有用,呜呜)。项目结构:当然,text.dat……
|
副标题[/!--empirenews.page--]
开始之前,先看一下从人人网中发现的90后用户爱用的词
是不是很好玩,哈哈。写这篇文章就是让你简单地、自动地从文本中找出新的词,这样就知道现在的年轻人喜欢什么了(对于博主这种上了年纪的人来说,真的是很有用,呜呜)。项目结构:
当然,text.dat 和 common.dic 这两个文件你可以随意替换,但要注意 text.dat 中的数据一定要够份量,否则没啥效果。原理么,看下 Matrix67 大牛的文章《互联网时代的社会语言学:基于SNS的文本数据挖掘》你就懂了。(训练数据下载)下边开始上代码。common 这个包里边包含以下几个类,主要是定义数据结构:
CountMap.java定义一个计数Map来进行数据操作和持久化 package grid.common;
import java.io.Serializable;
import java.util.HashMap;
/**
 * A {@link HashMap} that tracks how many times each key has been seen.
 *
 * <p>Keys map to their occurrence count. The class is {@link Serializable}
 * so a populated map can be persisted.
 *
 * @param <T> type of the counted elements
 */
public class CountMap<T> extends HashMap<T,Integer> implements Serializable {
    private static final long serialVersionUID = 6097963798841161750L;

    /**
     * Records one more occurrence of {@code t}.
     *
     * @param t element to count; starts at 1 when it has not been seen before
     */
    public void increase(T t) {
        final Integer seenSoFar = get(t);
        if (seenSoFar != null) {
            put(t, seenSoFar + 1);
            return;
        }
        put(t, 1);
    }

    /**
     * Adds up the counters of all keys.
     *
     * @return the total number of occurrences recorded in this map
     */
    public int count() {
        int total = 0;
        for (Integer occurrences : values()) {
            total += occurrences;
        }
        return total;
    }

    /**
     * Looks up the counter stored for a character key.
     *
     * @param c character to look up (boxed to {@link Character} for the lookup)
     * @return the stored count, or 0 when the character has no entry
     */
    public int get(char c) {
        final Integer stored = super.get(c);
        if (stored == null) {
            return 0;
        }
        return stored;
    }
}
Node.java定义语法树的节点 package grid.common;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * A node of a generic tree: it holds a value, a reference to its parent and
 * a lazily created list of children.
 *
 * @param <T> type of the value carried by each node
 */
public class Node<T> {
    /** Child nodes; remains {@code null} until the first call to {@link #add(Object)}. */
    protected List<Node<T>> children;
    /** Parent node, or {@code null} when none has been set. */
    protected Node<T> parent;
    /** Value carried by this node. */
    protected T value;

    /**
     * Creates a node without parent or children.
     *
     * @param value value carried by the node
     */
    Node(T value) {
        this.value = value;
    }

    /**
     * Appends a new child carrying {@code value} to this node.
     *
     * @param value value for the new child
     * @return the newly created child node
     */
    public Node<T> add(T value) {
        if (null == children) {
            children = new ArrayList<Node<T>>();
        }
        Node<T> child = new Node<T>(value);
        child.setParent(this);
        children.add(child);
        return child;
    }

    /** @return the value carried by this node */
    public T getValue() {
        return value;
    }

    /** @return the parent node, or {@code null} when none has been set */
    public Node<T> getParent() {
        return parent;
    }

    /** @param parent node to register as this node's parent */
    public void setParent(Node<T> parent) {
        this.parent = parent;
    }

    /**
     * Depth-first collection of every node below {@code parent} that has no
     * children list; {@code parent} itself is collected when it is such a node.
     */
    private void recurseChildren(List<Node<T>> list, Node<T> parent) {
        if (null == parent.children) {
            list.add(parent);
        } else {
            for (Node<T> node : parent.children) {
                recurseChildren(list, node);
            }
        }
    }

    /**
     * Collects the leaves of the subtree rooted at this node.
     *
     * @return the leaves in depth-first order; contains only this node when
     *         it has no children
     */
    public List<Node<T>> getLeaves() {
        List<Node<T>> list = new ArrayList<Node<T>>();
        recurseChildren(list, this);
        return list;
    }

    /**
     * Returns the values found on the way from the top of the tree down to
     * this node.
     *
     * <p>The walk goes upward through {@code parent} links and stops when
     * there is no parent or when the parent is a {@link Tree} instance, whose
     * value is therefore not included. This node's own value is always the
     * last element.
     *
     * @return the values ordered from the topmost ancestor to this node
     */
    public List<T> getBranchPath() {
        List<T> list = new ArrayList<T>();
        Node<T> node = this;
        do {
            list.add(node.getValue());
            node = node.parent;
        } while (null != node && !(node instanceof Tree<?>));
        // Values were gathered bottom-up; flip them into top-down order.
        Collections.reverse(list);
        return list;
    }

    /**
     * Writes one line for {@code node}, indented by {@code deep} levels, and
     * then recurses into its children.
     */
    private void append(StringBuilder builder, int deep, Node<T> node) {
        for (int i = 0; i < deep; i++) {
            builder.append(" ");
        }
        builder.append("|--");
        builder.append(node.getValue());
        // Terminate each entry with a line break; the literal "n" that was
        // here glued all entries together on a single line.
        builder.append("\n");
        if (null != node.children) {
            for (Node<T> child : node.children) {
                append(builder, deep + 1, child);
            }
        }
    }

    /**
     * Renders the subtree rooted at this node as text, one node per line.
     *
     * @return the multi-line textual representation
     */
    public String dump() {
        StringBuilder builder = new StringBuilder();
        append(builder, 0, this);
        return builder.toString();
    }

    /**
     * @return the string form of the value; {@code "null"} when the value is
     *         {@code null}, matching what {@link #dump()} prints
     */
    @Override
    public String toString() {
        return String.valueOf(value);
    }
}
TextDatReader.java读取训练数据 package grid.common;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
/**
 * Helper that loads a whole text file into memory.
 */
public class TextDatReader {
    /**
     * Reads the file at {@code path} into a string using {@link FileReader}'s
     * default character encoding.
     *
     * <p>The buffer is sized from the file length in bytes, so the returned
     * string always has exactly that many characters. When decoding yields
     * fewer characters than bytes (multi-byte encodings), the remainder is
     * padded with {@code '\u0000'}. This padding is kept on purpose:
     * {@code CnDictionary.initWordDic} treats the first NUL as end of data.
     *
     * @param path location of the file to read
     * @return the file content, possibly followed by NUL padding
     * @throws IOException if the file cannot be opened or read
     */
    public static String read(String path) throws IOException {
        File file = new File(path);
        char[] buffer = new char[(int) file.length()];
        FileReader reader = new FileReader(file);
        try {
            // A single read() may deliver fewer characters than requested,
            // so keep reading until the buffer is full or the stream ends.
            int filled = 0;
            while (filled < buffer.length) {
                int n = reader.read(buffer, filled, buffer.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
        } finally {
            // Always release the file handle, even when reading fails.
            reader.close();
        }
        return new String(buffer);
    }
}
TextUtils.java用来做文本处理,如判断是否为空、匹配字符等 package grid.common;
/**
 * Small character and string helpers used by the text-mining code.
 */
public class TextUtils {
    /** First code unit accepted by {@link #isCnLetter(char)}. */
    private static final char CN_FIRST = 0x4E00;
    /** Last code unit accepted by {@link #isCnLetter(char)}. */
    private static final char CN_LAST = 0x9FCB;

    /** Tells whether {@code c} lies in the inclusive range {@code [low, high]}. */
    private static boolean within(char c, char low, char high) {
        return low <= c && c <= high;
    }

    /**
     * @param c character to test
     * @return {@code true} when {@code c} is in the range U+4E00..U+9FCB
     */
    public static boolean isCnLetter(char c) {
        return within(c, CN_FIRST, CN_LAST);
    }

    /**
     * @param c character to test
     * @return {@code true} when {@code c} is an ASCII digit '0'..'9'
     */
    public static boolean isNumeric(char c) {
        return within(c, '0', '9');
    }

    /**
     * @param c character to test
     * @return {@code true} when {@code c} is an ASCII letter a-z or A-Z
     */
    public static boolean isEnLetter(char c) {
        if (within(c, 'a', 'z')) {
            return true;
        }
        return within(c, 'A', 'Z');
    }

    /**
     * Checks whether {@code dest} occurs in {@code src} starting exactly at
     * index {@code off}.
     *
     * @param src  text to look into
     * @param off  index in {@code src} where the comparison starts
     * @param dest text expected at that position
     * @return {@code true} when every character of {@code dest} matches;
     *         {@code false} on a mismatch or when {@code src} ends too early
     */
    public static boolean match(String src,int off,String dest) {
        final int wanted = dest.length();
        final int available = src.length();
        int cursor = off;
        for (int k = 0; k < wanted; k++, cursor++) {
            // Running past the end of src means dest cannot fit here.
            if (cursor >= available) {
                return false;
            }
            if (dest.charAt(k) != src.charAt(cursor)) {
                return false;
            }
        }
        return true;
    }

    /**
     * @param str string to test, may be {@code null}
     * @return {@code true} when {@code str} is {@code null}, empty, or
     *         becomes empty after {@link String#trim()}
     */
    public static boolean isBlank(String str) {
        if (str == null) {
            return true;
        }
        // An empty string stays empty after trim(), so one check covers both.
        return str.trim().isEmpty();
    }
}
Tree.java语法树 package grid.common;
/**
 * Marker subclass of {@link Node} used as the top of a node hierarchy.
 *
 * <p>It adds no state or behaviour of its own. {@code Node.getBranchPath()}
 * stops walking upward when it reaches an instance of this class.
 *
 * @param <T> type of the value carried by each node
 */
public class Tree<T> extends Node<T> {

    /**
     * Creates a tree node carrying the given value.
     *
     * @param value value stored in this node
     */
    public Tree(final T value) {
        super(value);
    }
}
dic里边包含CnDictionary类
CnDictionary.java词典处理 package grid.text.dic;
import grid.common.CountMap;
import grid.common.TextDatReader;
import grid.common.TextUtils;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
/**
 * Lazily created singleton holding a set of known words and per-character
 * frequency statistics.
 *
 * <p>Words are loaded from {@code common.dic}. Character statistics are
 * computed from {@code text.dat}, counting only characters accepted by
 * {@code TextUtils.isCnLetter}.
 */
public class CnDictionary {
    /** Path of the word list; entries are separated by line breaks. */
    private static final String COMMON_WORD_DIC_PATH = "common.dic";
    /**
     * This text data is for character statistic. Change to your own if you
     * like.
     */
    private static final String COMMON_LETTER_RESOURCE_PATH = "text.dat";

    /** Known words, trimmed. */
    private Set<String> dictionary = new HashSet<String>();
    /** Occurrence count per counted character of the statistics text. */
    private CountMap<Character> letterCountMap = new CountMap<Character>();
    /** Sum of all counters in {@link #letterCountMap}. */
    private int totalLetterCount;

    private static CnDictionary instance;

    /**
     * Returns the shared instance, creating it on first use.
     *
     * <p>When loading the data files fails, the stack trace is printed and
     * {@code null} is returned; the next call tries again. No synchronization
     * is performed here.
     *
     * @return the shared dictionary, or {@code null} if it could not be loaded
     */
    public static CnDictionary Instance() {
        if (null == instance) {
            try {
                instance = new CnDictionary();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return instance;
    }

    /**
     * Loads the word list and the character statistics.
     *
     * @throws IOException if either data file cannot be read
     */
    private CnDictionary() throws IOException {
        initWordDic();
        initLetterCountMap();
    }

    /**
     * Counts every character of the statistics text that
     * {@code TextUtils.isCnLetter} accepts and caches the grand total.
     *
     * @throws IOException if the statistics text cannot be read
     */
    private void initLetterCountMap() throws IOException {
        // Read the corpus data (text.dat).
        String letterResource = TextDatReader.read(COMMON_LETTER_RESOURCE_PATH);
        final int len = letterResource.length();
        for (int i = 0; i < len; i++) {
            char c = letterResource.charAt(i);
            if (TextUtils.isCnLetter(c)) {
                letterCountMap.increase(c);
            }
        }
        totalLetterCount = letterCountMap.count();
    }

    /**
     * Loads the word list, splitting it on line breaks.
     *
     * <p>Reading stops at the first NUL character, because
     * {@code TextDatReader.read} may pad its result with NULs. The word being
     * built when the data ends is stored as well, so a list without a
     * trailing line break does not lose its last entry.
     *
     * @throws IOException if the word list cannot be read
     */
    private void initWordDic() throws IOException {
        // Read the dictionary (common.dic).
        String content = TextDatReader.read(COMMON_WORD_DIC_PATH);
        final int len = content.length();
        StringBuilder word = new StringBuilder();
        for (int i = 0; i < len; i++) {
            char c = content.charAt(i);
            if (0 == c) {
                // NUL marks the end of the real data.
                break;
            }
            if ('\n' == c || '\r' == c) {
                flushWord(word);
            } else {
                word.append(c);
            }
        }
        // Store the final word when the data did not end with a line break.
        flushWord(word);
    }

    /** Adds the accumulated word to the dictionary unless it is blank, then resets the buffer. */
    private void flushWord(StringBuilder word) {
        String s = word.toString();
        if (!TextUtils.isBlank(s)) {
            dictionary.add(s.trim());
        }
        word.setLength(0);
    }

    /**
     * @param word word to look up
     * @return {@code true} when the word is in the loaded word list
     */
    public boolean contains(String word) {
        return dictionary.contains(word);
    }

    /**
     * Relative frequency of a character in the statistics text.
     *
     * @param c character to look up
     * @return count of {@code c} divided by the total number of counted
     *         characters; 0 for an unseen character, and {@code NaN} when no
     *         characters were counted at all
     */
    public double rate(char c) {
        return (double) letterCountMap.get(c) / totalLetterCount;
    }

    /** @return the number of words in the loaded word list */
    public int size() {
        return dictionary.size();
    }
}
evolution(编辑:西安站长网) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容! |





