DL | Deep Learning Summary (skip-gram PyTorch implementation)


Table of Contents

  • A plain skip-gram implementation in PyTorch
    • Network structure
    • Training loop: using nn.NLLLoss()
    • Batch preparation: building (center, context) pairs for unsupervised training
    • Sampling optimization: subsampling high-frequency words
  • skip-gram, advanced: negative sampling
    • Two common optimizations for computational efficiency: negative sampling and hierarchical softmax
    • Negative sampling implementation:
      • Negative sampling: the idea
      • Negative sampling: sampling the noise words
      • Negative sampling: the forward pass
      • Negative sampling: the training loop

A plain skip-gram implementation in PyTorch

Network structure
import torch
from torch import nn, optim


class SkipGram(nn.Module):
    def __init__(self, n_vocab, n_embed):
        super().__init__()

        self.embed = nn.Embedding(n_vocab, n_embed)   # input word id -> dense vector
        self.output = nn.Linear(n_embed, n_vocab)     # scores over the whole vocabulary
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = self.embed(x)
        scores = self.output(x)
        log_ps = self.log_softmax(scores)
        return log_ps
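As a quick sanity check of the shapes (the tiny vocabulary and batch below are made up for the example):

import torch

toy_model = SkipGram(n_vocab=10, n_embed=4)   # tiny, hypothetical sizes
x = torch.LongTensor([1, 2, 3])               # a batch of 3 word ids
log_ps = toy_model(x)
print(log_ps.shape)                           # torch.Size([3, 10]): log-probabilities over the vocabulary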

Training loop: using nn.NLLLoss()
# check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

embedding_dim = 300  # you can change, if you want

model = SkipGram(len(vocab_to_int), embedding_dim).to(device)
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

print_every = 500
steps = 0
epochs = 5

# train for some number of epochs
for e in range(epochs):

    # get input and target batches
    for inputs, targets in get_batches(train_words, 512):
        steps += 1
        inputs, targets = torch.LongTensor(inputs), torch.LongTensor(targets)
        inputs, targets = inputs.to(device), targets.to(device)

        log_ps = model(inputs)
        loss = criterion(log_ps, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # periodically report the training loss
        if steps % print_every == 0:
            print(f"Epoch {e+1}/{epochs}  Step {steps}  Loss: {loss.item():.4f}")

Batch preparation: training is unsupervised, so we build (center, context) pairs from the raw data:
def get_target(words, idx, window_size=5):
    ''' Get a list of words in a window around an index. '''
    R = np.random.randint(1, window_size+1)
    start = idx - R if (idx - R) > 0 else 0
    stop = idx + R
    target_words = words[start:idx] + words[idx+1:stop+1]
    return list(target_words)


def get_batches(words, batch_size, window_size=5):
    ''' Create a generator of word batches as a tuple (inputs, targets) '''
    n_batches = len(words)//batch_size

    # only full batches
    words = words[:n_batches*batch_size]

    for idx in range(0, len(words), batch_size):
        x, y = [], []
        batch = words[idx:idx+batch_size]
        for ii in range(len(batch)):
            batch_x = batch[ii]
            batch_y = get_target(batch, ii, window_size)
            y.extend(batch_y)
            x.extend([batch_x]*len(batch_y))
        yield x, y
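To make the (center, context) pairing concrete, here is a small sketch on a toy list of integer word ids (the toy data, seed, and printed pairs are illustrative assumptions, not from the original notebook):

import numpy as np

np.random.seed(0)                        # only so the randomly sized window is reproducible here
toy_words = [0, 1, 2, 3, 4, 5, 6, 7]     # a hypothetical integer-encoded sentence

# context words around index 3; the window radius is sampled in [1, window_size]
print(get_target(toy_words, idx=3, window_size=5))

# first (inputs, targets) batch: each center id is repeated once per context word
x, y = next(get_batches(toy_words, batch_size=8, window_size=5))
print(list(zip(x, y))[:5])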

Sampling optimization: subsampling high-frequency words

Words that show up often, such as "the", "of", and "for", don't provide much context to the nearby words. If we discard some of them, we can remove some of the noise from our data and in return get faster training and better representations. This process is called subsampling by Mikolov. For each word $w_i$ in the training set, we'll discard it with probability given by
$P(w_i) = 1 - \sqrt{\frac{t}{f(w_i)}}$
where $t$ is a threshold parameter and $f(w_i)$ is the frequency of word $w_i$ in the total dataset.
from collections import Counter
import random
import numpy as np

threshold = 1e-5
word_counts = Counter(int_words)
# print(list(word_counts.items())[0])  # dictionary of int_words, how many times they appear

total_count = len(int_words)
freqs = {word: count/total_count for word, count in word_counts.items()}
p_drop = {word: 1 - np.sqrt(threshold/freqs[word]) for word in word_counts}

# discard some frequent words, according to the subsampling equation
# create a new list of words for training
train_words = [word for word in int_words if random.random() < (1 - p_drop[word])]
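As a rough worked example of the subsampling equation (the 1% frequency is made up for illustration): with the threshold $t = 10^{-5}$ used above, a word that makes up 1% of the corpus has

$P(w_i) = 1 - \sqrt{\frac{10^{-5}}{0.01}} \approx 0.97,$

so it is dropped about 97% of the time, while any word with $f(w_i) \le t$ has $P(w_i) \le 0$ and is always kept.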

skip-gram, advanced: negative sampling

Two common optimizations for computational efficiency: negative sampling and hierarchical softmax
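To see why these matter: in the plain model above, every training step evaluates a softmax (and its gradient) over the whole vocabulary. Writing $v_{w_c}$ for the input (center) embedding and $u_w$ for the output embedding of word $w$ (notation introduced here for exposition), the predicted probability of a context word $w_o$ is

$p(w_o \mid w_c) = \frac{\exp(u_{w_o}^\top v_{w_c})}{\sum_{w=1}^{|V|} \exp(u_{w}^\top v_{w_c})},$

whose denominator sums over all $|V|$ words, so the per-example cost grows linearly with the vocabulary size. Hierarchical softmax replaces the flat softmax with a binary tree over the vocabulary (roughly $O(\log |V|)$ per example), while negative sampling replaces it with one positive pair plus a small number $K$ of sampled noise words.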

Negative sampling implementation:

Negative sampling: the idea
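The loss module below implements the standard negative sampling objective from Mikolov et al.: for a center word $w_c$, its true context word $w_o$, and $K$ noise words $w_k$ drawn from a noise distribution $P_n(w)$, the per-example loss is

$-\log \sigma(u_{w_o}^\top v_{w_c}) - \sum_{k=1}^{K} \log \sigma(-u_{w_k}^\top v_{w_c}),$

where $\sigma$ is the sigmoid. In the code, `out_loss` and `noise_loss` are the two log-sigmoid terms (without their minus signs), and the forward pass returns the batch mean of their negated sum.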

class NegativeSamplingLoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, input_vectors, output_vectors, noise_vectors):

        batch_size, embed_size = input_vectors.shape

        # Input vectors should be a batch of column vectors
        input_vectors = input_vectors.view(batch_size, embed_size, 1)

        # Output vectors should be a batch of row vectors
        output_vectors = output_vectors.view(batch_size, 1, embed_size)

        # bmm = batch matrix multiplication
        # correct log-sigmoid loss
        out_loss = torch.bmm(output_vectors, input_vectors).sigmoid().log()
        out_loss = out_loss.squeeze()

        # incorrect log-sigmoid loss
        noise_loss = torch.bmm(noise_vectors.neg(), input_vectors).sigmoid().log()
        noise_loss = noise_loss.squeeze().sum(1)  # sum the losses over the sample of noise vectors

        # negate and sum correct and noisy log-sigmoid losses
        # return average batch loss
        return -(out_loss + noise_loss).mean()

Negative sampling: sampling the noise words
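The distribution the noise words are drawn from follows the word2vec convention of raising the unigram (word frequency) distribution $U(w)$ to the 3/4 power,

$P_n(w) = \frac{U(w)^{3/4}}{\sum_{w'} U(w')^{3/4}},$

which slightly flattens the unigram distribution so that rare words are picked as negatives a bit more often than their raw frequency would suggest. This is exactly what the `unigram_dist**(0.75)` normalization below computes.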

# Get our noise distribution
# Using word frequencies calculated earlier in the notebook
word_freqs = np.array(sorted(freqs.values(), reverse=True))
unigram_dist = word_freqs/word_freqs.sum()
noise_dist = torch.from_numpy(unigram_dist**(0.75)/np.sum(unigram_dist**(0.75)))

Negative sampling: the forward pass
class SkipGramNeg(nn.Module):
    def __init__(self, n_vocab, n_embed, noise_dist=None):
        super().__init__()

        self.n_vocab = n_vocab
        self.n_embed = n_embed
        self.noise_dist = noise_dist

        # define embedding layers for input and output words
        self.in_embed = nn.Embedding(n_vocab, n_embed)
        self.out_embed = nn.Embedding(n_vocab, n_embed)

        # Initialize embedding tables with uniform distribution
        # I believe this helps with convergence
        self.in_embed.weight.data.uniform_(-1, 1)
        self.out_embed.weight.data.uniform_(-1, 1)

    def forward_input(self, input_words):
        input_vectors = self.in_embed(input_words)
        return input_vectors

    def forward_output(self, output_words):
        output_vectors = self.out_embed(output_words)
        return output_vectors

    def forward_noise(self, batch_size, n_samples):
        """ Generate noise vectors with shape (batch_size, n_samples, n_embed)"""
        if self.noise_dist is None:
            # Sample words uniformly
            noise_dist = torch.ones(self.n_vocab)
        else:
            noise_dist = self.noise_dist

        # Sample words from our noise distribution
        noise_words = torch.multinomial(noise_dist,
                                        batch_size * n_samples,
                                        replacement=True)

        # keep the noise words on the same device as the embedding weights
        device = "cuda" if self.out_embed.weight.is_cuda else "cpu"
        noise_words = noise_words.to(device)

        noise_vectors = self.out_embed(noise_words).view(batch_size, n_samples, self.n_embed)

        return noise_vectors

Negative sampling: the training loop
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Get our noise distribution
# Using word frequencies calculated earlier in the notebook
word_freqs = np.array(sorted(freqs.values(), reverse=True))
unigram_dist = word_freqs/word_freqs.sum()
noise_dist = torch.from_numpy(unigram_dist**(0.75)/np.sum(unigram_dist**(0.75)))

# instantiating the model
embedding_dim = 300
model = SkipGramNeg(len(vocab_to_int), embedding_dim, noise_dist=noise_dist).to(device)

# using the loss that we defined
criterion = NegativeSamplingLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

print_every = 1500
steps = 0
epochs = 5

# train for some number of epochs
for e in range(epochs):

    # get our input, target batches
    for input_words, target_words in get_batches(train_words, 512):
        steps += 1
        inputs, targets = torch.LongTensor(input_words), torch.LongTensor(target_words)
        inputs, targets = inputs.to(device), targets.to(device)

        # input, output, and noise vectors
        input_vectors = model.forward_input(inputs)
        output_vectors = model.forward_output(targets)
        noise_vectors = model.forward_noise(inputs.shape[0], 5)

        # negative sampling loss
        loss = criterion(input_vectors, output_vectors, noise_vectors)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # periodically report the training loss
        if steps % print_every == 0:
            print(f"Epoch {e+1}/{epochs}  Step {steps}  Loss: {loss.item():.4f}")
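After training, the learned word vectors are simply the rows of the input embedding table. A minimal sketch of pulling them out and comparing two words (the example words 'king' and 'queen' and the cosine helper are hypothetical, not part of the original notebook):

import numpy as np

# rows of in_embed are the trained word vectors, shape (len(vocab_to_int), embedding_dim)
embeddings = model.in_embed.weight.detach().cpu().numpy()

def cosine_sim(a, b):
    # cosine similarity between two vectors
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

w1 = vocab_to_int['king']    # hypothetical words; use any words in your vocabulary
w2 = vocab_to_int['queen']
print(cosine_sim(embeddings[w1], embeddings[w2]))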
