# Install gensim from the Tsinghua PyPI mirror (run in a shell or a notebook "!" cell).
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple gensim
# -*- coding: utf-8 -*-
import logging
# Configure logging BEFORE importing gensim so gensim's INFO messages are emitted
# (the timestamped lines visible in the transcript below).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import corpora  # NOTE(review): `corpora` is unused in this chunk — presumably used later; verify.
# Toy corpus: nine short English documents (the classic gensim tutorial set).
documents = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]
e:\ProgramData\Anaconda3\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
2017-12-25 14:51:28,440 : INFO : 'pattern' package not found; tag filters are not available for English
# Common English function words to drop before building the corpus.
stoplist = set('for a of the and to in'.split())

# Lower-case and whitespace-tokenize each document, filtering out stop words.
texts = []
for document in documents:
    tokens = document.lower().split()
    texts.append([token for token in tokens if token not in stoplist])
import jieba

# Segment a Chinese sentence with jieba in precise mode (cut_all=False),
# then print one token per line.
content = """面对当前挑战,我们应该落实2030年可持续发展议程,促进包容性发展"""
content = list(jieba.cut(content, cut_all=False))
print("\n".join(content))
Building prefix dict from the default dictionary ...
2017-12-25 14:51:32,170 : DEBUG : Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\THREEP~1\AppData\Local\Temp\jieba.cache
2017-12-25 14:51:32,178 : DEBUG : Loading model from cache C:\Users\THREEP~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.710 seconds.
2017-12-25 14:51:32,881 : DEBUG : Loading model cost 0.710 seconds.
Prefix dict has been built succesfully.
2017-12-25 14:51:32,883 : DEBUG : Prefix dict has been built succesfully.
面对
当前
挑战
,
我们
应该
落实
2030
年
可
持续
发展
议程
,
促进
包容性
发展
import matplotlib.pyplot as plt
import pylab
# Enlarge inline figures to 15 x 8 inches for the word-cloud plots below.
pylab.rcParams['figure.figsize'] = (15.0, 8.0)
% matplotlib inline
import numpy as np
import pandas as pd
# NOTE(review): scipy.misc.imread was deprecated and removed in SciPy >= 1.2;
# on newer environments use imageio.imread or matplotlib.pyplot.imread instead.
from scipy.misc import imread
from wordcloud import WordCloud
name = "docs/assets/择天记.txt"
# Read the whole novel once as a single string...
with open(name, encoding="utf-8") as fp:
    book = fp.read()
# ...and again as a list of lines, skipping near-empty lines (length <= 2).
with open(name, encoding="utf-8") as fp:
    lines = [line for line in fp.readlines() if len(line) > 2]
print(book[:100])
print(lines[:10])
择天记
猫腻
玄幻奇幻
太始元年,有神石自太空飞来,分散落在人间,其中落在东土大陆的神石,上面镌刻着奇怪的图腾,人因观其图腾而悟道,后立国教。
数千年后,十四岁的少年孤儿陈长生,为治病改命离开自
['择天记\n', '猫腻\n', '玄幻奇幻\n', '太始元年,有神石自太空飞来,分散落在人间,其中落在东土大陆的神石,上面镌刻着奇怪的图腾,人因观其图腾而悟道,后立国教。\n', '数千年后,十四岁的少年孤儿陈长生,为治病改命离开自己的师父,带着一纸婚约来到神都,从而开启了一个逆天强者的崛起征程。\n', '各位书友要是觉得《择天记》还不错的话请不要忘记向您QQ群和微博里的朋友推荐哦!\n', '序 下山\n', '世界是相对的。\n', '中土大6隔着海洋与大西洲遥遥相对。东方地势较高,那里的天空似乎也高了起来,云雾从海上6地上升腾而起,不停向着那处飘去,最终汇聚在一起,终年不散。\n', '这里便是云墓——世间所有云的坟墓。\n']
import jieba

# Register the novel's main characters as custom dictionary entries with a
# proper-noun tag ("nz") so jieba keeps each name as a single token.
for proper_noun in ("陈长生", "徐有容", "落落", "小黑龙"):
    jieba.add_word(proper_noun, tag="nz")

# Segment the whole book, keeping only tokens longer than one character
# (drops punctuation and single-character function words).
segments = [token for token in jieba.cut(book) if len(token) > 1]
df = pd.DataFrame({'segment': segments})
df
segment | |
---|---|
0 | 择天记 |
1 | 猫腻 |
2 | 玄幻 |
3 | 奇幻 |
4 | 太始 |
5 | 元年 |
6 | 有神 |
7 | 石自 |
8 | 太空 |
9 | 飞来 |
10 | 分散 |
11 | 人间 |
12 | 其中 |
13 | 东土 |
14 | 大陆 |
15 | 神石 |
16 | 上面 |
17 | 镌刻 |
18 | 奇怪 |
19 | 图腾 |
20 | 因观 |
21 | 图腾 |
22 | 悟道 |
23 | 立国 |
24 | 数千年 |
25 | 十四岁 |
26 | 少年 |
27 | 孤儿 |
28 | 陈长生 |
29 | 治病 |
... | ... |
1024912 | 常见 |
1024913 | 宫廷 |
1024914 | 故事 |
1024915 | 陈长生 |
1024916 | 说道 |
1024917 | 我要 |
1024918 | 圣城 |
1024919 | 我们 |
1024920 | 可能 |
1024921 | 顺路 |
1024922 | 铁面人 |
1024923 | 焦急 |
1024924 | 说道 |
1024925 | 一定 |
1024926 | 顺路 |
1024927 | 一定 |
1024928 | 顺路 |
1024929 | 就算 |
1024930 | 地狱 |
1024931 | 毫不犹豫 |
1024932 | 跟随 |
1024933 | 脚步 |
1024934 | 陈长生 |
1024935 | 说道 |
1024936 | 如果 |
1024937 | 我要 |
1024938 | 地方 |
1024939 | 神国 |
1024940 | 全文 |
1024941 | 本章 |
1024942 rows × 1 columns
# Load the stop-word list and drop every segment that appears in it.
# NOTE(review): read_csv with defaults treats the file's first line as the
# header, so the file is assumed to start with a "stopword" column name — verify.
stopwords = pd.read_csv(u"docs/assets/stop_words.txt")
is_stopword = df["segment"].isin(stopwords["stopword"])
df = df[~is_stopword]
df
segment | |
---|---|
0 | 择天记 |
1 | 猫腻 |
2 | 玄幻 |
3 | 奇幻 |
4 | 太始 |
5 | 元年 |
6 | 有神 |
7 | 石自 |
8 | 太空 |
9 | 飞来 |
10 | 分散 |
11 | 人间 |
13 | 东土 |
14 | 大陆 |
15 | 神石 |
16 | 上面 |
17 | 镌刻 |
18 | 奇怪 |
19 | 图腾 |
20 | 因观 |
21 | 图腾 |
22 | 悟道 |
23 | 立国 |
24 | 数千年 |
25 | 十四岁 |
26 | 少年 |
27 | 孤儿 |
28 | 陈长生 |
29 | 治病 |
30 | 改命 |
... | ... |
1024910 | 推演出来 |
1024911 | 一个 |
1024912 | 常见 |
1024913 | 宫廷 |
1024914 | 故事 |
1024915 | 陈长生 |
1024916 | 说道 |
1024917 | 我要 |
1024918 | 圣城 |
1024920 | 可能 |
1024921 | 顺路 |
1024922 | 铁面人 |
1024923 | 焦急 |
1024924 | 说道 |
1024925 | 一定 |
1024926 | 顺路 |
1024927 | 一定 |
1024928 | 顺路 |
1024929 | 就算 |
1024930 | 地狱 |
1024931 | 毫不犹豫 |
1024932 | 跟随 |
1024933 | 脚步 |
1024934 | 陈长生 |
1024935 | 说道 |
1024937 | 我要 |
1024938 | 地方 |
1024939 | 神国 |
1024940 | 全文 |
1024941 | 本章 |
905949 rows × 1 columns
# Count occurrences of each segment and sort by frequency, descending.
# The original dict-based aggregation (`.agg({"count": np.size})`) triggers the
# FutureWarning shown in the transcript and was removed in later pandas;
# GroupBy.size() is the supported equivalent and produces the same
# (segment, count) frame.
segStat = (
    df.groupby("segment")
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)
segStat.head(20)
e:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
"""Entry point for launching an IPython kernel.
segment | count | |
---|---|---|
49602 | 陈长生 | 15939 |
31806 | 没有 | 13387 |
44735 | 说道 | 10080 |
36711 | 看着 | 6958 |
37136 | 知道 | 6245 |
19516 | 已经 | 4787 |
14009 | 国教 | 4271 |
4017 | 事情 | 4187 |
2416 | 不是 | 3760 |
1568 | 三十六 | 3686 |
17320 | 学院 | 3575 |
21252 | 徐有容 | 3339 |
21111 | 很多 | 3017 |
35031 | 现在 | 2969 |
543 | 一个 | 2819 |
41362 | 能够 | 2416 |
12033 | 可能 | 2403 |
20163 | 应该 | 2352 |
5529 | 仿佛 | 2170 |
36634 | 看到 | 2137 |
# Render a word cloud of the 20 most frequent segments, shaped by a mask image
# and drawn with a CJK-capable font.
back_coloring = imread(u"docs/assets/mask.jpg")
wordcloud = WordCloud(font_path=u"docs/assets/wqywmh.ttf", background_color="white", mask=back_coloring)
plt.axis("off")
top20 = {word: freq for word, freq in segStat.head(20).itertuples(index=False)}
wordcloud = wordcloud.fit_words(top20)
plt.imshow(wordcloud)
<matplotlib.image.AxesImage at 0x2c0c23986a0>
import jieba.posseg as pseg

# POS-tag the first 100 characters of the book and print only the tokens
# jieba flags as person names ('nr').
words = pseg.cut(book[:100])
for token in words:
    if token.flag == 'nr':
        print(token.word, token.flag, token)
玄幻 nr 玄幻/nr
石自 nr 石自/nr
孤儿 nr 孤儿/nr
# Build the Word2Vec training data: one token list per (non-empty) line of the book.
sentences = [list(jieba.cut(line)) for line in lines]
TODO: 考虑更加细腻的 sentence 切分方式——目前按行切分，可进一步按标点断句。
import gensim

# Train a word2vec model: 200-dim vectors, 5-word context window, drop words
# seen fewer than 5 times, 4 worker threads.
# NOTE(review): `size=` is the gensim 3.x parameter name (renamed `vector_size` in 4.0).
model = gensim.models.Word2Vec(
    sentences,
    size=200,
    window=5,
    min_count=5,
    workers=4,
)
2017-12-25 15:56:37,001 : INFO : collecting all words and their counts
2017-12-25 15:56:37,003 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-12-25 15:56:37,065 : INFO : PROGRESS: at sentence #10000, processed 260578 words, keeping 18494 word types
2017-12-25 15:56:37,123 : INFO : PROGRESS: at sentence #20000, processed 542096 words, keeping 27616 word types
2017-12-25 15:56:37,177 : INFO : PROGRESS: at sentence #30000, processed 847726 words, keeping 35161 word types
2017-12-25 15:56:37,232 : INFO : PROGRESS: at sentence #40000, processed 1116154 words, keeping 40354 word types
2017-12-25 15:56:37,275 : INFO : PROGRESS: at sentence #50000, processed 1347282 words, keeping 44113 word types
2017-12-25 15:56:37,325 : INFO : PROGRESS: at sentence #60000, processed 1570782 words, keeping 47817 word types
2017-12-25 15:56:37,376 : INFO : PROGRESS: at sentence #70000, processed 1784533 words, keeping 51083 word types
2017-12-25 15:56:37,415 : INFO : PROGRESS: at sentence #80000, processed 1967748 words, keeping 53288 word types
2017-12-25 15:56:37,450 : INFO : collected 55415 word types from a corpus of 2123096 raw words and 87625 sentences
2017-12-25 15:56:37,451 : INFO : Loading a fresh vocabulary
2017-12-25 15:56:37,599 : INFO : min_count=5 retains 16452 unique words (29% of original 55415, drops 38963)
2017-12-25 15:56:37,600 : INFO : min_count=5 leaves 2060983 word corpus (97% of original 2123096, drops 62113)
2017-12-25 15:56:37,749 : INFO : deleting the raw counts dictionary of 55415 items
2017-12-25 15:56:37,752 : INFO : sample=0.001 downsamples 40 most-common words
2017-12-25 15:56:37,754 : INFO : downsampling leaves estimated 1500692 word corpus (72.8% of prior 2060983)
2017-12-25 15:56:37,755 : INFO : estimated required memory for 16452 words and 200 dimensions: 34549200 bytes
2017-12-25 15:56:37,845 : INFO : resetting layer weights
2017-12-25 15:56:38,082 : INFO : training model with 4 workers on 16452 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2017-12-25 15:56:39,091 : INFO : PROGRESS: at 13.98% examples, 1130832 words/s, in_qsize 7, out_qsize 0
2017-12-25 15:56:40,094 : INFO : PROGRESS: at 32.37% examples, 1257027 words/s, in_qsize 7, out_qsize 0
2017-12-25 15:56:41,099 : INFO : PROGRESS: at 49.64% examples, 1273680 words/s, in_qsize 7, out_qsize 0
2017-12-25 15:56:42,103 : INFO : PROGRESS: at 68.29% examples, 1303971 words/s, in_qsize 7, out_qsize 0
2017-12-25 15:56:43,108 : INFO : PROGRESS: at 86.07% examples, 1302467 words/s, in_qsize 7, out_qsize 0
2017-12-25 15:56:43,882 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-25 15:56:43,884 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-25 15:56:43,891 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-25 15:56:43,898 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-25 15:56:43,899 : INFO : training on 10615480 raw words (7502987 effective words) took 5.8s, 1291613 effective words/s
# Print the words most similar to "圣后" with their cosine similarities.
# `model.most_similar` was deprecated in gensim 3.x (the DeprecationWarning is
# visible in the transcript) and removed in gensim 4.0; the supported call
# lives on the KeyedVectors object at `model.wv`.
for k, s in model.wv.most_similar(positive=[u"圣后"]):
    print(k, s)
e:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
"""Entry point for launching an IPython kernel.
2017-12-25 15:58:02,828 : INFO : precomputing L2-norms of word weight vectors
胜雪 0.806833028793335
皇后 0.7890491485595703
承武 0.7554647326469421
承文 0.7192850708961487
家 0.7164201736450195
沾衣 0.6703976392745972
朝 0.6002320051193237
先帝 0.5968987345695496
白帝 0.5489777326583862
旨意 0.5485547184944153