Import the required libraries
# All Import Statements Defined Here
# Note: Do not add to this list.
# All the dependencies you need can be installed by running .
# ----------------
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import pprint
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]

# Import nltk and download the Reuters news corpus
import nltk
nltk.download('reuters')
from nltk.corpus import reuters

import numpy as np
import random
import scipy as sp

# Dimensionality reduction
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
# Sentence boundary tokens
START_TOKEN = '<START>'
END_TOKEN = '<END>'

# Fix the random seeds so every run produces the same random numbers
np.random.seed(0)
random.seed(0)
# ----------------
To make the later steps easier, the input first needs some preprocessing.
# Define read_corpus: wrap each document with START and END tokens and lowercase every word
def read_corpus(category="crude"):
    """ Read files from the specified Reuters category.
        Params:
            category (string): category name
        Return:
            list of lists, with words from each of the processed files
    """
    files = reuters.fileids(category)
    return [[START_TOKEN] + [w.lower() for w in list(reuters.words(f))] + [END_TOKEN] for f in files]
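For a quick look at what read_corpus returns, a minimal usage sketch (assuming the Reuters data was downloaded by the nltk.download call above):
reuters_corpus = read_corpus()
print(len(reuters_corpus))        # number of documents in the "crude" category
print(reuters_corpus[0][:10])     # first ten tokens of the first processed document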
Flatten a list of lists in one line in Python. To flatten a list of lists into a single list, the usual approach is a nested loop:
list_of_lists = [range(4), range(5)]
print(list_of_lists[:])

flattened_list = []
for x in list_of_lists:
    for y in x:
        flattened_list.append(y)
print(flattened_list[:])
The same result can be achieved in a single line with a list comprehension:
list_of_lists = [range(4), range(5)]
print(list_of_lists[:])

flattened_list = [y for x in list_of_lists for y in x]
print(flattened_list[:])
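For completeness, the standard library offers the same flattening via itertools.chain; a small sketch (not part of the original assignment):
from itertools import chain

list_of_lists = [range(4), range(5)]
flattened_list = list(chain.from_iterable(list_of_lists))
print(flattened_list)   # [0, 1, 2, 3, 0, 1, 2, 3, 4]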
Question 1.1: Implement distinct_words
def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    corpus_words = []
    num_corpus_words = -1

    # ------------------
    # Write your implementation here.
    # First flatten corpus (a list of lists) into a single list of words
    corpus = [w for doc in corpus for w in doc]
    # A set keeps only the distinct words; convert the result back to a list with list()
    corpus_words = list(set(corpus))
    # Sort in ascending alphabetical order with the built-in sorted()
    corpus_words = sorted(corpus_words)
    # Use len() to get the number of distinct words
    num_corpus_words = len(corpus_words)
    # ------------------

    print(corpus_words[:], num_corpus_words)
    return corpus_words, num_corpus_words
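A quick sanity check on a toy corpus, similar in spirit to the assignment's public test (the sentences here are illustrative):
# Toy corpus for a quick sanity check (illustrative, not the official test)
test_corpus = [
    "START All that glitters isn't gold END".split(" "),
    "START All's well that ends well END".split(" "),
]
test_words, num_test_words = distinct_words(test_corpus)
print(test_words)        # alphabetically sorted distinct words
print(num_test_words)    # 10 distinct words in this toy corpus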
Question 1.2: Implement compute_co_occurrence_matrix
def compute_co_occurrence_matrix(corpus, window_size=4):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 4).

        Note: Each word in a document should be at the center of a window. Words near edges will have a smaller
        number of co-occurring words.

        For example, if we take the document "START All that glitters is not gold END" with window size of 4,
        "All" will co-occur with "START", "that", "glitters", "is", and "not".

        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (numpy matrix of shape (number of corpus words, number of corpus words)):
                Co-occurrence matrix of word counts.
                The ordering of the words in the rows/columns should be the same as the ordering of the words given by the distinct_words function.
            word2Ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    words, num_words = distinct_words(corpus)
    M = None
    word2Ind = {}

    # ------------------
    # Write your implementation here.
    # Initialize the co-occurrence matrix with zeros; M is symmetric, so its shape is num_words x num_words
    M = np.zeros(shape=(num_words, num_words), dtype=np.int32)
    # Build the mapping from word to index and store it in the dictionary word2Ind
    for i in range(num_words):
        word2Ind[words[i]] = i
    # Process each document in the corpus
    for sent in corpus:
        for p in range(len(sent)):
            # Index of the current (center) word in word2Ind
            ci = word2Ind[sent[p]]
            # Words before the center: near the start of a sentence there may be fewer than
            # window_size preceding words, so clamp the lower bound at 0
            for w in sent[max(0, p - window_size):p]:
                wi = word2Ind[w]
                M[ci][wi] += 1
            # Words after the center
            for w in sent[p + 1:p + 1 + window_size]:
                wi = word2Ind[w]
                M[ci][wi] += 1
    # ------------------

    return M, word2Ind
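As a quick check, the matrix can be computed on the toy corpus defined above; the counts should be symmetric (this snippet is illustrative, not the official test):
# Illustrative check with window_size=1 on the toy corpus defined earlier
test_M, test_word2Ind = compute_co_occurrence_matrix(test_corpus, window_size=1)
print(test_M.shape)                                           # (10, 10)
print(test_M[test_word2Ind['All'], test_word2Ind['that']])    # counts are symmetric:
print(test_M[test_word2Ind['that'], test_word2Ind['All']])    # these two numbers match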
Question 1.3: Implement reduce_to_k_dim
sklearn.decomposition.TruncatedSVD performs dimensionality reduction via truncated singular value decomposition (SVD).
Parameters
- n_components : int, default = 2
- algorithm : string, default = "randomized"
- n_iter : int, optional (default 5)
- random_state : int, RandomState instance or None, optional, default = None
- tol : float, optional
Attributes
- components_ : array, shape (n_components, n_features)
- explained_variance_ : array, shape (n_components,)
  The variance of the training samples transformed by a projection to each component.
- explained_variance_ratio_ : array, shape (n_components,)
  Percentage of variance explained by each of the selected components.
- singular_values_ : array, shape (n_components,)
  The singular values corresponding to each of the selected components. The singular values are equal to the 2-norms of the n_components variables in the lower-dimensional space.
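A minimal sketch of TruncatedSVD on a synthetic count matrix, just to show the attributes listed above (the data here is random and only for illustration):
# Synthetic example: reduce a random 20x10 count matrix to 2 components
X = np.random.randint(0, 5, size=(20, 10))
svd = TruncatedSVD(n_components=2, n_iter=10, random_state=0)
X_reduced = svd.fit_transform(X)      # shape (20, 2); equals U * Sigma for the top components
print(X_reduced.shape)
print(svd.explained_variance_ratio_)  # fraction of variance captured by each component
print(svd.singular_values_)           # top-2 singular values of X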
def reduce_to_k_dim(M, k=2):
    """ Reduce a co-occurrence count matrix of dimensionality (num_corpus_words, num_corpus_words)
        to a matrix of dimensionality (num_corpus_words, k) using the following SVD function from Scikit-Learn:
            - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

        Params:
            M (numpy matrix of shape (number of corpus words, number of corpus words)): co-occurrence matrix of word counts
            k (int): embedding size of each word after dimension reduction
        Return:
            M_reduced (numpy matrix of shape (number of corpus words, k)): matrix of k-dimensional word embeddings.
                    In terms of the SVD from math class, this actually returns U * S
    """
    n_iters = 10     # Use this parameter in your call to `TruncatedSVD`
    M_reduced = None
    print("Running Truncated SVD over %i words..." % (M.shape[0]))

    # ------------------
    # Write your implementation here.
    # fit_transform projects M onto its top k components and returns U * S,
    # which is exactly the reduced embedding matrix the docstring asks for
    svd = TruncatedSVD(n_components=k, n_iter=n_iters)
    M_reduced = svd.fit_transform(M)
    # ------------------

    print("Done.")
    return M_reduced
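Chaining the three functions together on the Reuters corpus gives the reduced embeddings used for plotting; a sketch that mirrors the assignment's flow (variable names are only suggestions):
# Build the co-occurrence matrix for the "crude" category and reduce it to 2 dimensions
reuters_corpus = read_corpus()
M_co_occurrence, word2Ind_co_occurrence = compute_co_occurrence_matrix(reuters_corpus)
M_reduced_co_occurrence = reduce_to_k_dim(M_co_occurrence, k=2)
print(M_reduced_co_occurrence.shape)   # (number of distinct words, 2)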
Question 1.4: Implement plot_embeddings
def plot_embeddings(M_reduced, word2Ind, words):
    """ Plot in a scatterplot the embeddings of the words specified in the list "words".
        NOTE: do not plot all the words listed in M_reduced / word2Ind.
        Include a label next to each point.

        Params:
            M_reduced (numpy matrix of shape (number of unique words in the corpus , k)): matrix of k-dimensional word embeddings
            word2Ind (dict): dictionary that maps word to indices for matrix M
            words (list of strings): words whose embeddings we want to visualize
    """

    # ------------------
    # Write your implementation here.
    for word in words:
        x = M_reduced[word2Ind[word]][0]
        y = M_reduced[word2Ind[word]][1]
        # Scatter plot: mark each word with an 'x' and label the point with the word itself
        plt.scatter(x, y, marker='x')
        plt.text(x, y, word)
    plt.show()
    # ------------------
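A tiny synthetic check, similar to the assignment's sanity test (the coordinates and word names below are arbitrary):
# Synthetic 2-D embeddings for a visual sanity check
M_reduced_plot_test = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1], [0, 0]])
word2Ind_plot_test = {'test1': 0, 'test2': 1, 'test3': 2, 'test4': 3, 'test5': 4}
words_plot_test = ['test1', 'test2', 'test3', 'test4', 'test5']
plot_embeddings(M_reduced_plot_test, word2Ind_plot_test, words_plot_test)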
Question 2.1: Word2Vec Plot Analysis
words = ['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']
plot_embeddings(M_reduced, word2Ind, words)
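The questions below query wv_from_bin, which the assignment loads from pretrained word2vec vectors with gensim's KeyedVectors (the KeyedVectors and datapath imports at the top exist for this). A minimal loading sketch; the file name below is a placeholder for a local copy of the GoogleNews vectors:
# Sketch: load pretrained word2vec vectors into wv_from_bin
# (replace the placeholder path with the actual location of the .bin file)
wv_from_bin = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin", binary=True)
print("Embedding dimension:", wv_from_bin.vector_size)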
Question 2.2: Polysemous Words
wv_from_bin.most_similar("leaves")
Question 2.3: Synonyms & Antonyms
w1 = "happy"
w2 = "cheerful"
w3 = "sad"
w1_w2_dist = wv_from_bin.distance(w1, w2)
w1_w3_dist = wv_from_bin.distance(w1, w3)

print("Synonyms {}, {} have cosine distance: {}".format(w1, w2, w1_w2_dist))
print("Antonyms {}, {} have cosine distance: {}".format(w1, w3, w1_w3_dist))
Question 2.4: Finding Analogies
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'king'], negative=['man']))
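most_similar with positive/negative lists implements the vector-arithmetic analogy man : king :: woman : ?. The same query can be written out explicitly as a sketch; the result will not match most_similar exactly, since gensim works with normalized vectors, but it illustrates the arithmetic:
# Explicit analogy arithmetic: vec("king") - vec("man") + vec("woman")
analogy_vec = wv_from_bin["king"] - wv_from_bin["man"] + wv_from_bin["woman"]
# Nearest neighbours of the resulting vector (the query words themselves may appear among them)
pprint.pprint(wv_from_bin.similar_by_vector(analogy_vec, topn=5))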
Question 2.5: Incorrect Analogy
pprint.pprint(wv_from_bin.most_similar(positive=['woman','him'], negative=['man']))
Question 2.6: Guided Analysis of Bias in Word Vectors
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'boss'], negative=['man']))
print()
pprint.pprint(wv_from_bin.most_similar(positive=['man', 'boss'], negative=['woman']))
Question 2.7: Independent Analysis of Bias in Word Vectors
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'doctor'], negative=['man']))
print()
pprint.pprint(wv_from_bin.most_similar(positive=['man', 'doctor'], negative=['woman']))