# Install gensim from the Tsinghua PyPI mirror (run in a shell or a notebook "!" cell).
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple gensim
# -*- coding: utf-8 -*-
import logging
# Configure logging BEFORE importing gensim so gensim's INFO messages are emitted
# (the timestamped lines visible in the transcript below).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import corpora  # NOTE(review): `corpora` is unused in this chunk — presumably used later; verify.
# Toy corpus: nine short English documents (the classic gensim tutorial set).
documents = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]
e:\ProgramData\Anaconda3\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
2017-12-25 14:51:28,440 : INFO : 'pattern' package not found; tag filters are not available for English
# Common English function words to drop before building the corpus.
stoplist = set('for a of the and to in'.split())

# Lower-case and whitespace-tokenize each document, filtering out stop words.
texts = []
for document in documents:
    tokens = document.lower().split()
    texts.append([token for token in tokens if token not in stoplist])
import jieba

# Segment a Chinese sentence with jieba in precise mode (cut_all=False),
# then print one token per line.
content = """面对当前挑战,我们应该落实2030年可持续发展议程,促进包容性发展"""
content = list(jieba.cut(content, cut_all=False))
print("\n".join(content))
Building prefix dict from the default dictionary ...
2017-12-25 14:51:32,170 : DEBUG : Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\THREEP~1\AppData\Local\Temp\jieba.cache
2017-12-25 14:51:32,178 : DEBUG : Loading model from cache C:\Users\THREEP~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.710 seconds.
2017-12-25 14:51:32,881 : DEBUG : Loading model cost 0.710 seconds.
Prefix dict has been built succesfully.
2017-12-25 14:51:32,883 : DEBUG : Prefix dict has been built succesfully.
面对
当前
挑战
,
我们
应该
落实
2030
年
可
持续
发展
议程
,
促进
包容性
发展
import matplotlib.pyplot as plt
import pylab
# Enlarge inline figures to 15 x 8 inches for the word-cloud plots below.
pylab.rcParams['figure.figsize'] = (15.0, 8.0)
% matplotlib inline
import numpy as np
import pandas as pd
# NOTE(review): scipy.misc.imread was deprecated and removed in SciPy >= 1.2;
# on newer environments use imageio.imread or matplotlib.pyplot.imread instead.
from scipy.misc import imread
from wordcloud import WordCloud
name = "docs/assets/择天记.txt"
# Read the whole novel once as a single string...
with open(name, encoding="utf-8") as fp:
    book = fp.read()
# ...and again as a list of lines, skipping near-empty lines (length <= 2).
with open(name, encoding="utf-8") as fp:
    lines = [line for line in fp.readlines() if len(line) > 2]
print(book[:100])
print(lines[:10])
择天记
猫腻
玄幻奇幻
太始元年,有神石自太空飞来,分散落在人间,其中落在东土大陆的神石,上面镌刻着奇怪的图腾,人因观其图腾而悟道,后立国教。
数千年后,十四岁的少年孤儿陈长生,为治病改命离开自
['择天记\n', '猫腻\n', '玄幻奇幻\n', '太始元年,有神石自太空飞来,分散落在人间,其中落在东土大陆的神石,上面镌刻着奇怪的图腾,人因观其图腾而悟道,后立国教。\n', '数千年后,十四岁的少年孤儿陈长生,为治病改命离开自己的师父,带着一纸婚约来到神都,从而开启了一个逆天强者的崛起征程。\n', '各位书友要是觉得《择天记》还不错的话请不要忘记向您QQ群和微博里的朋友推荐哦!\n', '序 下山\n', '世界是相对的。\n', '中土大6隔着海洋与大西洲遥遥相对。东方地势较高,那里的天空似乎也高了起来,云雾从海上6地上升腾而起,不停向着那处飘去,最终汇聚在一起,终年不散。\n', '这里便是云墓——世间所有云的坟墓。\n']
import jieba

# Register the novel's main characters as custom dictionary entries with a
# proper-noun tag ("nz") so jieba keeps each name as a single token.
for proper_noun in ("陈长生", "徐有容", "落落", "小黑龙"):
    jieba.add_word(proper_noun, tag="nz")

# Segment the whole book, keeping only tokens longer than one character
# (drops punctuation and single-character function words).
segments = [token for token in jieba.cut(book) if len(token) > 1]
df = pd.DataFrame({'segment': segments})
df
segment | |
---|---|
0 | 择天记 |
1 | 猫腻 |
2 | 玄幻 |
3 | 奇幻 |
4 | 太始 |
5 | 元年 |
6 | 有神 |
7 | 石自 |
8 | 太空 |
9 | 飞来 |
10 | 分散 |
11 | 人间 |
12 | 其中 |
13 | 东土 |
14 | 大陆 |
15 | 神石 |
16 | 上面 |
17 | 镌刻 |
18 | 奇怪 |
19 | 图腾 |
20 | 因观 |
21 | 图腾 |
22 | 悟道 |
23 | 立国 |
24 | 数千年 |
25 | 十四岁 |
26 | 少年 |
27 | 孤儿 |
28 | 陈长生 |
29 | 治病 |
... | ... |
1024912 | 常见 |
1024913 | 宫廷 |
1024914 | 故事 |
1024915 | 陈长生 |
1024916 | 说道 |
1024917 | 我要 |
1024918 | 圣城 |
1024919 | 我们 |
1024920 | 可能 |
1024921 | 顺路 |
1024922 | 铁面人 |
1024923 | 焦急 |
1024924 | 说道 |
1024925 | 一定 |
1024926 | 顺路 |
1024927 | 一定 |
1024928 | 顺路 |
1024929 | 就算 |
1024930 | 地狱 |
1024931 | 毫不犹豫 |
1024932 | 跟随 |
1024933 | 脚步 |
1024934 | 陈长生 |
1024935 | 说道 |
1024936 | 如果 |
1024937 | 我要 |
1024938 | 地方 |
1024939 | 神国 |
1024940 | 全文 |
1024941 | 本章 |
1024942 rows × 1 columns
# Load the stop-word list and drop every segment that appears in it.
# NOTE(review): read_csv with defaults treats the file's first line as the
# header, so the file is assumed to start with a "stopword" column name — verify.
stopwords = pd.read_csv(u"docs/assets/stop_words.txt")
is_stopword = df["segment"].isin(stopwords["stopword"])
df = df[~is_stopword]
df
segment | |
---|---|
0 | 择天记 |
1 | 猫腻 |
2 | 玄幻 |
3 | 奇幻 |
4 | 太始 |
5 | 元年 |
6 | 有神 |
7 | 石自 |
8 | 太空 |
9 | 飞来 |
10 | 分散 |
11 | 人间 |
13 | 东土 |
14 | 大陆 |
15 | 神石 |
16 | 上面 |
17 | 镌刻 |
18 | 奇怪 |
19 | 图腾 |
20 | 因观 |
21 | 图腾 |
22 | 悟道 |
23 | 立国 |
24 | 数千年 |
25 | 十四岁 |
26 | 少年 |
27 | 孤儿 |
28 | 陈长生 |
29 | 治病 |
30 | 改命 |
... | ... |
1024910 | 推演出来 |
1024911 | 一个 |
1024912 | 常见 |
1024913 | 宫廷 |
1024914 | 故事 |
1024915 | 陈长生 |
1024916 | 说道 |
1024917 | 我要 |
1024918 | 圣城 |
1024920 | 可能 |
1024921 | 顺路 |
1024922 | 铁面人 |
1024923 | 焦急 |
1024924 | 说道 |
1024925 | 一定 |
1024926 | 顺路 |
1024927 | 一定 |
1024928 | 顺路 |
1024929 | 就算 |
1024930 | 地狱 |
1024931 | 毫不犹豫 |
1024932 | 跟随 |
1024933 | 脚步 |
1024934 | 陈长生 |
1024935 | 说道 |
1024937 | 我要 |
1024938 | 地方 |
1024939 | 神国 |
1024940 | 全文 |
1024941 | 本章 |
905949 rows × 1 columns
# Count occurrences of each segment and sort by frequency, descending.
# The original dict-based aggregation (`.agg({"count": np.size})`) triggers the
# FutureWarning shown in the transcript and was removed in later pandas;
# GroupBy.size() is the supported equivalent and produces the same
# (segment, count) frame.
segStat = (
    df.groupby("segment")
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)
segStat.head(20)
e:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
"""Entry point for launching an IPython kernel.
segment | count | |
---|---|---|
49602 | 陈长生 | 15939 |
31806 | 没有 | 13387 |
44735 | 说道 | 10080 |
36711 | 看着 | 6958 |
37136 | 知道 | 6245 |
19516 | 已经 | 4787 |
14009 | 国教 | 4271 |
4017 | 事情 | 4187 |
2416 | 不是 | 3760 |
1568 | 三十六 | 3686 |
17320 | 学院 | 3575 |
21252 | 徐有容 | 3339 |
21111 | 很多 | 3017 |
35031 | 现在 | 2969 |
543 | 一个 | 2819 |
41362 | 能够 | 2416 |
12033 | 可能 | 2403 |
20163 | 应该 | 2352 |
5529 | 仿佛 | 2170 |
36634 | 看到 | 2137 |
# Render a word cloud of the 20 most frequent segments, shaped by a mask image
# and drawn with a CJK-capable font.
back_coloring = imread(u"docs/assets/mask.jpg")
wordcloud = WordCloud(font_path=u"docs/assets/wqywmh.ttf", background_color="white", mask=back_coloring)
plt.axis("off")
top20 = {word: freq for word, freq in segStat.head(20).itertuples(index=False)}
wordcloud = wordcloud.fit_words(top20)
plt.imshow(wordcloud)
<matplotlib.image.AxesImage at 0x2c0c23986a0>
import jieba.posseg as pseg

# POS-tag the first 100 characters of the book and print only the tokens
# jieba flags as person names ('nr').
words = pseg.cut(book[:100])
for token in words:
    if token.flag == 'nr':
        print(token.word, token.flag, token)
玄幻 nr 玄幻/nr
石自 nr 石自/nr
孤儿 nr 孤儿/nr
# Build the Word2Vec training data: one token list per (non-empty) line of the book.
sentences = [list(jieba.cut(line)) for line in lines]
TODO: 考虑更加细腻的 sentence 切分方式——目前按行切分，可进一步按标点断句。
import gensim

# Train a word2vec model: 200-dim vectors, 5-word context window, drop words
# seen fewer than 5 times, 4 worker threads.
# NOTE(review): `size=` is the gensim 3.x parameter name (renamed `vector_size` in 4.0).
model = gensim.models.Word2Vec(
    sentences,
    size=200,
    window=5,
    min_count=5,
    workers=4,
)
2017-12-25 15:56:37,001 : INFO : collecting all words and their counts
2017-12-25 15:56:37,003 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-12-25 15:56:37,065 : INFO : PROGRESS: at sentence #10000, processed 260578 words, keeping 18494 word types
2017-12-25 15:56:37,123 : INFO : PROGRESS: at sentence #20000, processed 542096 words, keeping 27616 word types
2017-12-25 15:56:37,177 : INFO : PROGRESS: at sentence #30000, processed 847726 words, keeping 35161 word types
2017-12-25 15:56:37,232 : INFO : PROGRESS: at sentence #40000, processed 1116154 words, keeping 40354 word types
2017-12-25 15:56:37,275 : INFO : PROGRESS: at sentence #50000, processed 1347282 words, keeping 44113 word types
2017-12-25 15:56:37,325 : INFO : PROGRESS: at sentence #60000, processed 1570782 words, keeping 47817 word types
2017-12-25 15:56:37,376 : INFO : PROGRESS: at sentence #70000, processed 1784533 words, keeping 51083 word types
2017-12-25 15:56:37,415 : INFO : PROGRESS: at sentence #80000, processed 1967748 words, keeping 53288 word types
2017-12-25 15:56:37,450 : INFO : collected 55415 word types from a corpus of 2123096 raw words and 87625 sentences
2017-12-25 15:56:37,451 : INFO : Loading a fresh vocabulary
2017-12-25 15:56:37,599 : INFO : min_count=5 retains 16452 unique words (29% of original 55415, drops 38963)
2017-12-25 15:56:37,600 : INFO : min_count=5 leaves 2060983 word corpus (97% of original 2123096, drops 62113)
2017-12-25 15:56:37,749 : INFO : deleting the raw counts dictionary of 55415 items
2017-12-25 15:56:37,752 : INFO : sample=0.001 downsamples 40 most-common words
2017-12-25 15:56:37,754 : INFO : downsampling leaves estimated 1500692 word corpus (72.8% of prior 2060983)
2017-12-25 15:56:37,755 : INFO : estimated required memory for 16452 words and 200 dimensions: 34549200 bytes
2017-12-25 15:56:37,845 : INFO : resetting layer weights
2017-12-25 15:56:38,082 : INFO : training model with 4 workers on 16452 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2017-12-25 15:56:39,091 : INFO : PROGRESS: at 13.98% examples, 1130832 words/s, in_qsize 7, out_qsize 0
2017-12-25 15:56:40,094 : INFO : PROGRESS: at 32.37% examples, 1257027 words/s, in_qsize 7, out_qsize 0
2017-12-25 15:56:41,099 : INFO : PROGRESS: at 49.64% examples, 1273680 words/s, in_qsize 7, out_qsize 0
2017-12-25 15:56:42,103 : INFO : PROGRESS: at 68.29% examples, 1303971 words/s, in_qsize 7, out_qsize 0
2017-12-25 15:56:43,108 : INFO : PROGRESS: at 86.07% examples, 1302467 words/s, in_qsize 7, out_qsize 0
2017-12-25 15:56:43,882 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-25 15:56:43,884 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-25 15:56:43,891 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-25 15:56:43,898 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-25 15:56:43,899 : INFO : training on 10615480 raw words (7502987 effective words) took 5.8s, 1291613 effective words/s
# Print the words most similar to "圣后" with their cosine similarities.
# `model.most_similar` was deprecated in gensim 3.x (the DeprecationWarning is
# visible in the transcript) and removed in gensim 4.0; the supported call
# lives on the KeyedVectors object at `model.wv`.
for k, s in model.wv.most_similar(positive=[u"圣后"]):
    print(k, s)
e:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
"""Entry point for launching an IPython kernel.
2017-12-25 15:58:02,828 : INFO : precomputing L2-norms of word weight vectors
胜雪 0.806833028793335
皇后 0.7890491485595703
承武 0.7554647326469421
承文 0.7192850708961487
家 0.7164201736450195
沾衣 0.6703976392745972
朝 0.6002320051193237
先帝 0.5968987345695496
白帝 0.5489777326583862
旨意 0.5485547184944153