class VectorSpaceModel:
    """TF-IDF vector space model built from documents returned by ``load_data``.

    NOTE(review): ``load_data`` is defined outside this view; the code below
    assumes it returns a sized sequence of documents, each an iterable of
    hashable tokens -- confirm against its definition.
    """

    def __init__(self, path_to_data, path_to_stopwords=None):
        """Initialize the Vector Space Model.

        Loads the documents and then builds, in this order, the bag of words,
        the term frequencies, the inverse document frequencies and the tf-idf
        weights.

        Args:
            path_to_data (str): path to the data
            path_to_stopwords (str, optional): path to the stopwords.
                Defaults to None.
        """
        self.bag_of_words = []  # one {term id: raw count} mapping per document
        self.token2id = {}  # token -> integer id, in first-seen order
        self.idfs = {}  # term id -> idf, filled by compute_idfs()
        self.tfs = []  # one {term id: tf} mapping per document
        self.tfidfs = []  # one {term id: tf-idf} mapping per document
        self.docs = load_data(path_to_data, path_to_stopwords)  # read the data
        self.num_docs = len(self.docs)
        # Similarity matrix, initialised to num_docs x num_docs zeros.
        self.similarity = [
            [0.0] * self.num_docs for _ in range(self.num_docs)
        ]
        self.initialize_bow()  # build the bag of words
        self.compute_tfs()  # compute tf
        self.compute_idfs()  # compute idf
        self.compute_tfidfs()  # compute tf-idf

    def initialize_bow(self):
        """Generate the bag of words for the loaded documents.

        Every word not yet present in ``self.token2id`` receives the next free
        integer id.  ``self.bag_of_words`` is rebuilt from scratch with one
        ``{term id: count}`` mapping per document, so calling this method
        again does not duplicate entries and the list stays aligned with
        ``self.docs``.
        """
        bags = []
        for doc in self.docs:
            counter = defaultdict(int)
            for word in doc:
                # Number the word on first sight; reuse the id afterwards.
                termid = self.token2id.setdefault(word, len(self.token2id))
                counter[termid] += 1
            bags.append(counter)
        self.bag_of_words = bags
def compute_tfs(self):
    """Compute the term frequencies (tf) of every document.

    Intended as a method of ``VectorSpaceModel``.  For each mapping in
    ``self.bag_of_words`` the raw count of a term is divided by the sum of
    all counts in that document.  ``self.tfs`` is rebuilt on every call so
    that it always holds exactly one ``{term id: tf}`` mapping per document,
    in document order.  A document without terms yields an empty mapping.
    """
    tfs = []
    for bow in self.bag_of_words:
        total_fs = sum(bow.values())
        # An empty bag never reaches the division: the comprehension is empty.
        tfs.append({termid: fs / total_fs for termid, fs in bow.items()})
    self.tfs = tfs
def df2idf(docfreq, totaldocs, log_base=2.0): """Compute inverse document frequency(idf) for term t.
Args: docfreq (int): number of documents where the term t appears totaldocs (int): total number of documents log_base (float, optional): Defaults to 2.0.
def compute_idfs(self):
    """Compute the inverse document frequency (idf) of every term.

    Intended as a method of ``VectorSpaceModel``.  First counts, for each
    term id, how many mappings in ``self.bag_of_words`` contain it (its
    document frequency).  Each count is then passed to ``df2idf`` together
    with ``self.num_docs`` and the results are stored in ``self.idfs`` as
    ``{term id: idf}``.
    """
    doc_counts = {}
    for bag in self.bag_of_words:
        for term in bag:
            if term in doc_counts:
                doc_counts[term] += 1
            else:
                doc_counts[term] = 1

    weights = {}
    for term, count in doc_counts.items():
        weights[term] = df2idf(count, self.num_docs)
    self.idfs = weights
计算TF-IDF
\[\mathrm{tfidf}_{i,j} = \mathrm{tf}_{i,j} \times \mathrm{idf}_{i}\]
1 2 3 4 5 6 7
def compute_tfidfs(self):
    """Compute the tf-idf weights of every document.

    Intended as a method of ``VectorSpaceModel``.  Multiplies each term
    frequency in ``self.tfs`` by the matching entry of ``self.idfs`` and
    stores one ``{term id: tf-idf}`` mapping per document in
    ``self.tfidfs``.

    Raises:
        KeyError: if a term id in ``self.tfs`` has no entry in
            ``self.idfs``, i.e. the idfs were not computed from the same
            bag of words.
    """
    # Index directly: a missing idf is an invariant violation and should
    # name the offending term instead of failing on ``float * None``.
    self.tfidfs = [
        {termid: tf * self.idfs[termid] for termid, tf in adoc_tfs.items()}
        for adoc_tfs in self.tfs
    ]
def compute_similarity_subprocess(self, processes=1, i=0):
    """Subprocess function for computing the cosine similarity.

    Intended as a method of ``VectorSpaceModel``.  The documents are split
    into ``processes`` consecutive chunks of ``ceil(num_docs / processes)``
    documents; this call handles chunk ``i`` and compares each of its
    documents against every document in ``self.tfidfs``.

    NOTE(review): ``compute_dot_product`` and ``norm`` are defined outside
    this view; they are assumed to return the dot product of two tf-idf
    mappings and the vector length of one mapping -- confirm.

    Args:
        processes (int, optional): processes number. Defaults to 1.
        i (int, optional): process id. Defaults to 0.

    Returns:
        list[list[float]]: one row per document of the chunk, each row
        holding the similarity to every document.  A pair in which either
        document has zero norm gets 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    length = math.ceil(self.num_docs / processes)
    start = length * i  # first document this process is responsible for

    # A norm depends on a single document, so compute each one once instead
    # of once per document pair.
    norms = [norm(vector) for vector in self.tfidfs]

    similarity = []
    for offset, vector_1 in enumerate(self.tfidfs[start:start + length]):
        norm_1 = norms[start + offset]
        row = []
        for vector_2, norm_2 in zip(self.tfidfs, norms):
            denominator = norm_1 * norm_2
            if denominator:
                dot_product = compute_dot_product(vector_1, vector_2)
                row.append(dot_product / denominator)
            else:
                # Empty document or all-zero weights: nothing to compare.
                row.append(0.0)
        similarity.append(row)
    return similarity
def compute_similarity_multiprocess(self, processes):
    """Multiprocess implementation of the cosine similarity.

    Intended as a method of ``VectorSpaceModel``.  Submits one
    ``compute_similarity_subprocess`` task per process id to a pool of
    ``processes`` workers, waits for all of them, and concatenates their
    rows in process-id order into ``self.similarity``.

    Args:
        processes (int): processes number.
    """
    with Pool(processes) as pool:
        # One pending result per process id, kept in submission order.
        pending = [
            pool.apply_async(self.compute_similarity_subprocess,
                             (processes, rank))
            for rank in range(processes)
        ]
        pool.close()  # no further tasks will be submitted
        pool.join()  # wait for every worker to finish

    merged = []
    for task in pending:
        merged.extend(task.get())  # gather the rows of each process
    self.similarity = merged