pip install -i https://pypi.tuna.tsinghua.edu.cn/simple gensim

# -*- coding: utf-8 -*-
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import corpora

documents = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
              "The EPS user interface management system",
              "System and human system engineering testing of EPS",
              "Relation of user perceived response time to error measurement",
              "The generation of random binary unordered trees",
              "The intersection graph of paths in trees",
              "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]
# lowercase each document and drop common English stop words
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]
import jieba

content = """面对当前挑战,我们应该落实2030年可持续发展议程,促进包容性发展"""
content = list(jieba.cut(content, cut_all=False))  # accurate mode (jieba's default)
for word in content:
    print(word)
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\THREEP~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.710 seconds.
Prefix dict has been built succesfully.


面对
当前
挑战
,
我们
应该
落实
2030
年
可
持续
发展
议程
,
促进
包容性
发展
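Besides the accurate mode used above, jieba also ships a full mode and a search-engine mode; a quick sketch comparing them on a fresh sentence:

sentence = "面对当前挑战,我们应该落实2030年可持续发展议程"
print("/".join(jieba.cut(sentence, cut_all=True)))  # full mode: every dictionary hit, with overlaps
print("/".join(jieba.cut_for_search(sentence)))     # search mode: accurate mode plus finer splits of long words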
import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (15.0, 8.0)  # larger default figure size
%matplotlib inline

import numpy as np
import pandas as pd

from scipy.misc import imread  # note: removed in SciPy >= 1.2; imageio.imread is a drop-in replacement
from wordcloud import WordCloud
name = "docs/assets/择天记.txt"
with open(name, encoding="utf-8") as fp:
    book = fp.read()

with open(name, encoding="utf-8") as fp:
    lines = [line for line in fp.readlines() if len(line) > 2]

print(book[:100])
print(lines[:10])
择天记

猫腻

玄幻奇幻

太始元年,有神石自太空飞来,分散落在人间,其中落在东土大陆的神石,上面镌刻着奇怪的图腾,人因观其图腾而悟道,后立国教。
数千年后,十四岁的少年孤儿陈长生,为治病改命离开自
['择天记\n', '猫腻\n', '玄幻奇幻\n', '太始元年,有神石自太空飞来,分散落在人间,其中落在东土大陆的神石,上面镌刻着奇怪的图腾,人因观其图腾而悟道,后立国教。\n', '数千年后,十四岁的少年孤儿陈长生,为治病改命离开自己的师父,带着一纸婚约来到神都,从而开启了一个逆天强者的崛起征程。\n', '各位书友要是觉得《择天记》还不错的话请不要忘记向您QQ群和微博里的朋友推荐哦!\n', '序 下山\n', '世界是相对的。\n', '中土大6隔着海洋与大西洲遥遥相对。东方地势较高,那里的天空似乎也高了起来,云雾从海上6地上升腾而起,不停向着那处飘去,最终汇聚在一起,终年不散。\n', '这里便是云墓——世间所有云的坟墓。\n']
import jieba
# register the main character names so jieba keeps them as single tokens
jieba.add_word("陈长生", tag="nz")
jieba.add_word("徐有容", tag="nz")
jieba.add_word("落落", tag="nz")
jieba.add_word("小黑龙", tag="nz")

segments = [seg for seg in jieba.cut(book) if len(seg) > 1]  # drop single-character segments
df = pd.DataFrame({'segment': segments})
df
segment
0 择天记
1 猫腻
2 玄幻
3 奇幻
4 太始
5 元年
6 有神
7 石自
8 太空
9 飞来
10 分散
11 人间
12 其中
13 东土
14 大陆
15 神石
16 上面
17 镌刻
18 奇怪
19 图腾
20 因观
21 图腾
22 悟道
23 立国
24 数千年
25 十四岁
26 少年
27 孤儿
28 陈长生
29 治病
... ...
1024912 常见
1024913 宫廷
1024914 故事
1024915 陈长生
1024916 说道
1024917 我要
1024918 圣城
1024919 我们
1024920 可能
1024921 顺路
1024922 铁面人
1024923 焦急
1024924 说道
1024925 一定
1024926 顺路
1024927 一定
1024928 顺路
1024929 就算
1024930 地狱
1024931 毫不犹豫
1024932 跟随
1024933 脚步
1024934 陈长生
1024935 说道
1024936 如果
1024937 我要
1024938 地方
1024939 神国
1024940 全文
1024941 本章

1024942 rows × 1 columns
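A quick sanity check that the add_word entries took effect — the custom names should come out as single tokens (the test sentence below is made up for illustration):

print("/".join(jieba.cut("陈长生遇见了徐有容和落落")))  # hypothetical sentence; expect the names unsplit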

# assumes the file's first line is the header "stopword"; otherwise pass names=["stopword"]
stopwords = pd.read_csv(u"docs/assets/stop_words.txt")
df = df[~df.segment.isin(stopwords.stopword)]  # drop rows whose segment is a stop word
df
segment
0 择天记
1 猫腻
2 玄幻
3 奇幻
4 太始
5 元年
6 有神
7 石自
8 太空
9 飞来
10 分散
11 人间
13 东土
14 大陆
15 神石
16 上面
17 镌刻
18 奇怪
19 图腾
20 因观
21 图腾
22 悟道
23 立国
24 数千年
25 十四岁
26 少年
27 孤儿
28 陈长生
29 治病
30 改命
... ...
1024910 推演出来
1024911 一个
1024912 常见
1024913 宫廷
1024914 故事
1024915 陈长生
1024916 说道
1024917 我要
1024918 圣城
1024920 可能
1024921 顺路
1024922 铁面人
1024923 焦急
1024924 说道
1024925 一定
1024926 顺路
1024927 一定
1024928 顺路
1024929 就算
1024930 地狱
1024931 毫不犹豫
1024932 跟随
1024933 脚步
1024934 陈长生
1024935 说道
1024937 我要
1024938 地方
1024939 神国
1024940 全文
1024941 本章

905949 rows × 1 columns

segStat = (df.groupby("segment").size()
             .reset_index(name="count")
             .sort_values("count", ascending=False))
segStat.head(20)
segment count
49602 陈长生 15939
31806 没有 13387
44735 说道 10080
36711 看着 6958
37136 知道 6245
19516 已经 4787
14009 国教 4271
4017 事情 4187
2416 不是 3760
1568 三十六 3686
17320 学院 3575
21252 徐有容 3339
21111 很多 3017
35031 现在 2969
543 一个 2819
41362 能够 2416
12033 可能 2403
20163 应该 2352
5529 仿佛 2170
36634 看到 2137
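As an optional sketch, the same top-20 counts read more easily as a chart; SimHei is assumed to be an available CJK-capable font (it usually is on Windows):

plt.rcParams["font.sans-serif"] = ["SimHei"]  # assumption: a CJK-capable font, needed for Chinese labels
top20 = segStat.head(20)
plt.barh(range(len(top20)), top20["count"])
plt.yticks(range(len(top20)), top20["segment"])
plt.gca().invert_yaxis()  # most frequent segment on top
plt.xlabel("count")
plt.show()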
back_coloring = imread(u"docs/assets/mask.jpg")  # mask image that shapes the cloud

wordcloud = WordCloud(font_path=u"docs/assets/wqywmh.ttf", background_color="white", mask=back_coloring)
wordcloud = wordcloud.fit_words(dict([(s, g) for s, g in segStat.head(20).itertuples(index=False)]))

plt.imshow(wordcloud)
plt.axis("off")
<matplotlib.image.AxesImage at 0x2c0c23986a0>

[figure: word cloud of the top 20 segments]
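To keep the rendered image, WordCloud can also write a PNG directly; the output path is only an example:

wordcloud.to_file("docs/assets/wordcloud.png")  # example path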

import jieba.posseg as pseg

words = pseg.cut(book[:100])  # POS-tagged segmentation; 'nr' is jieba's tag for person names
for w in words:
    if w.flag == 'nr':
        print(w.word, w.flag, w)
玄幻 nr 玄幻/nr
石自 nr 石自/nr
孤儿 nr 孤儿/nr
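The same POS filter scales to the whole book; a sketch that tallies everything tagged as a person name (nr) or as one of the custom nz entries — note a full pass over the text takes a while:

from collections import Counter

name_counts = Counter(w.word for w in pseg.cut(book)
                      if w.flag in ("nr", "nz") and len(w.word) > 1)
print(name_counts.most_common(10))  # most frequent candidate character names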
# one "sentence" per line of the book, segmented with jieba
sentences = []
for line in lines:
    words = list(jieba.cut(line))
    sentences.append(words)
A more fine-grained sentence split (e.g. on punctuation rather than on line breaks) would be worth considering here.
import gensim
# note: in gensim >= 4.0 the parameter is named vector_size instead of size
model = gensim.models.Word2Vec(sentences, size=200, window=5, min_count=5, workers=4)
2017-12-25 15:56:37,001 : INFO : collecting all words and their counts
2017-12-25 15:56:37,003 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-12-25 15:56:37,065 : INFO : PROGRESS: at sentence #10000, processed 260578 words, keeping 18494 word types
2017-12-25 15:56:37,123 : INFO : PROGRESS: at sentence #20000, processed 542096 words, keeping 27616 word types
2017-12-25 15:56:37,177 : INFO : PROGRESS: at sentence #30000, processed 847726 words, keeping 35161 word types
2017-12-25 15:56:37,232 : INFO : PROGRESS: at sentence #40000, processed 1116154 words, keeping 40354 word types
2017-12-25 15:56:37,275 : INFO : PROGRESS: at sentence #50000, processed 1347282 words, keeping 44113 word types
2017-12-25 15:56:37,325 : INFO : PROGRESS: at sentence #60000, processed 1570782 words, keeping 47817 word types
2017-12-25 15:56:37,376 : INFO : PROGRESS: at sentence #70000, processed 1784533 words, keeping 51083 word types
2017-12-25 15:56:37,415 : INFO : PROGRESS: at sentence #80000, processed 1967748 words, keeping 53288 word types
2017-12-25 15:56:37,450 : INFO : collected 55415 word types from a corpus of 2123096 raw words and 87625 sentences
2017-12-25 15:56:37,451 : INFO : Loading a fresh vocabulary
2017-12-25 15:56:37,599 : INFO : min_count=5 retains 16452 unique words (29% of original 55415, drops 38963)
2017-12-25 15:56:37,600 : INFO : min_count=5 leaves 2060983 word corpus (97% of original 2123096, drops 62113)
2017-12-25 15:56:37,749 : INFO : deleting the raw counts dictionary of 55415 items
2017-12-25 15:56:37,752 : INFO : sample=0.001 downsamples 40 most-common words
2017-12-25 15:56:37,754 : INFO : downsampling leaves estimated 1500692 word corpus (72.8% of prior 2060983)
2017-12-25 15:56:37,755 : INFO : estimated required memory for 16452 words and 200 dimensions: 34549200 bytes
2017-12-25 15:56:37,845 : INFO : resetting layer weights
2017-12-25 15:56:38,082 : INFO : training model with 4 workers on 16452 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2017-12-25 15:56:39,091 : INFO : PROGRESS: at 13.98% examples, 1130832 words/s, in_qsize 7, out_qsize 0
2017-12-25 15:56:40,094 : INFO : PROGRESS: at 32.37% examples, 1257027 words/s, in_qsize 7, out_qsize 0
2017-12-25 15:56:41,099 : INFO : PROGRESS: at 49.64% examples, 1273680 words/s, in_qsize 7, out_qsize 0
2017-12-25 15:56:42,103 : INFO : PROGRESS: at 68.29% examples, 1303971 words/s, in_qsize 7, out_qsize 0
2017-12-25 15:56:43,108 : INFO : PROGRESS: at 86.07% examples, 1302467 words/s, in_qsize 7, out_qsize 0
2017-12-25 15:56:43,882 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-25 15:56:43,884 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-25 15:56:43,891 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-25 15:56:43,898 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-25 15:56:43,899 : INFO : training on 10615480 raw words (7502987 effective words) took 5.8s, 1291613 effective words/s
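Training only takes a few seconds here, but on a larger corpus the model is worth persisting; save and load are gensim's standard methods, and the path is only an example:

model.save("docs/assets/zetianji.w2v")  # example path
model = gensim.models.Word2Vec.load("docs/assets/zetianji.w2v")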
for k, s in model.wv.most_similar(positive=[u"圣后"]):
    print(k, s)
2017-12-25 15:58:02,828 : INFO : precomputing L2-norms of word weight vectors


胜雪 0.806833028793335
皇后 0.7890491485595703
承武 0.7554647326469421
承文 0.7192850708961487
家 0.7164201736450195
沾衣 0.6703976392745972
朝 0.6002320051193237
先帝 0.5968987345695496
白帝 0.5489777326583862
旨意 0.5485547184944153
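The same vectors also answer pairwise similarity and further neighbour queries, e.g. for the names tagged earlier:

print(model.wv.similarity(u"圣后", u"皇后"))  # cosine similarity between two words
for k, s in model.wv.most_similar(positive=[u"陈长生"], topn=5):
    print(k, s)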