gensim入門

import gensim
from gensim import corpora
from pprint import pprint
from collections import defaultdict

documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey"
]
# ストップワードの定義
stop_words = set('for a of the and to in'.split())

# 各ドキュメントをストップワードの含まれない単語の配列に
texts = [[word for word in document.lower().split() if word not in stop_words] for document in documents]
# [
    # ['human', 'machine', 'interface', ..],
    # ['survey', 'user', 'opinion', 'computer', ..

# 単語の出現回数を格納するためdefaultdict型を用意
frequency = defaultdict(int)

# 単語の出現回数をfrequency変数でカウント
for text in texts:
    for token in text:
        frequency[token] += 1
# defaultdict(<class 'int'>,
#     {'abc': 1,
#     'applications': 1,
#     'binary': 1,
#     'computer': 2, ....

# 全文章のうち、1回しか出現しなかった単語は除外
texts = [[token for token in text if frequency[token] > 1] for text in texts]

# 辞書の作成。ここでいう辞書は単語ID、単語、単語出現回数を持つデータのこと
dictionary = corpora.Dictionary(texts)

# 辞書はファイルに保存することもできる
# dictionary.save('/tmp/deerwester.dict')
# テキストファイルとして保存することも可能
# dictionary.save_as_text('/tmp/deerwester.dict.txt')

# コーパスの作成。ここでいうコーパスは文章ごとに「単語ID・出現頻度」タプル配列を持つデータのこと
corpus = [dictionary.doc2bow(text) for text in texts]
# [[(0, 1), (1, 1), (2, 1)],
#  [(2, 1), (3, 1), ...]]
# ファイルに保存する場合
# corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)

# num_topics=5で、5個のトピックを持つLDAモデルを作成
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=5, id2word=dictionary)
pprint(lda.show_topics())
# [(0,
#   '0.143*"human" + 0.143*"user" + 0.142*"time" + 0.142*"interface" + '
#   '0.142*"response" + 0.142*"computer" + 0.025*"trees" + 0.025*"graph" + '
#   '0.024*"system" + 0.024*"minors"'),
#  (1,
#   '0.341*"system" + 0.187*"eps" + 0.186*"human" + 0.033*"trees" + '
#   '0.032*"graph" + 0.032*"computer" + 0.032*"minors" + 0.032*"survey" + ' ......


# 新しい文を定義
test_documents = ["Computer themselves and software yet to be developed will revolutionize the way we learn"]

test_texts = [[word for word in document.lower().split()] for document in test_documents]

test_corpus = [dictionary.doc2bow(text) for text in test_texts]

pprint(test_corpus)

for topics_per_document in lda[test_corpus]:
    pprint(topics_per_document)