import numpy as np
from sklearn import feature_extraction
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Build a TF-IDF weight matrix from the article titles.
# NOTE(review): `items` is defined outside this snippet; presumably a pandas
# DataFrame whose 'split_title' column holds pre-segmented, space-separated
# titles -- confirm against the code that creates it.
corpus = items['split_title'].tolist()

# CountVectorizer turns the documents into a term-count matrix: element
# a[i][j] is the number of times term j occurs in document i.
# NOTE(review): the default token pattern ignores single-character tokens,
# which may drop one-character Chinese words -- confirm this is intended.
vectorizer = CountVectorizer()

# TfidfTransformer re-weights a term-count matrix into tf-idf values.
transformer = TfidfTransformer()

# The inner fit_transform produces the term-count matrix; the outer
# fit_transform converts those counts into tf-idf weights.
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

# Dense copy of the tf-idf matrix: weight[i][j] is the tf-idf weight of
# term j in document i.
weight = tfidf.toarray()
对 TF-IDF 嵌入处理后的向量进行 PCA 降维(降至 2 维,方便绘图)
1 2 3 4 5 6 7 8 9 10 11
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA