想象一下写一个句子并找到一个类似意思的句子会多么方便。为此,您需要能够对整个句子进行矢量化处理,这可能是一项非常艰巨的任务。
由于工作需要,我必须在发给技术支持的请求中查找相似的请求;即使已标注的数据量相当大,也很难收集到足够多主题相同但措辞不同的消息。
下面概述了对整个句子进行向量化的几种方法:这些方法不只是单纯地做向量化,还尝试在向量化时兼顾句子的含义。
例如,“epl 比三星更好”和“三星比 epl 更好”这两个短语,在向量的某一个分量上应当位于相反的两端,而在其余分量上应当重合。
可以用下面的图片作类比:在“从纸杯蛋糕到狗”这一维度上,它们位于相反的两端;而在黑点的数量和物体的颜色这两个维度上,它们是一致的。
文章中的方法非常简单且有趣,但是缺点是:
- 他们用英语测试
- 在每篇文章中都写到他们已经超越了前辈,但是比较是在不同的数据集上进行的,因此无法进行评级
— 7 .
- BOW
1.1. BOW
1.2. BOW c
1.3. BOW
1.4. LDA - ,
2.1
2.2
2.3 tf-idf - Language Models
3.1 Language Model on embeddings
3.2 Language Model on index - BERT
4.1 rubert_cased_L-12_H-768_A-12_pt
4.2 ru_conversational_cased_L-12_H-768_A-12_pt
4.3 sentence_ru_cased_L-12_H-768_A-12_pt
4.4 elmo_ru-news_wmt11-16_1.5M_steps
4.5 elmo_ru-wiki_600k_steps
4.6 elmo_ru-twitter_2013-01_2018-04_600k_steps -
5.1 embedings -> embedings
5.2 embedings -> indexes
5.3 LSTM -> LSTM
5.4 LSTM -> LSTM -> indexes - Transfer Learning
6.1 BOW
6.2 LSTM + MaxPooling
6.3 LSTM + Conv1D + AveragePooling
6.4 LSTM + Inception + Attention - Triplet loss
7.1 Triplet loss BOW
7.2 Triplet loss embedings
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import random
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.decomposition import LatentDirichletAllocation
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Bidirectional, LSTM, Dense, MaxPooling1D, AveragePooling1D, Conv1D
from tensorflow.keras.layers import Flatten, Reshape, Concatenate, Permute, Activation, Dropout, multiply
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.losses import cosine_similarity
from tensorflow.keras import regularizers
import tensorflow.keras.backend as K
import tensorflow as tf
import pymorphy2
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import re
from conllu import parse_incr
# SynTagRus split name -> CoNLL-U file name inside 'UD_Russian-SynTagRus-master'.
files = {'train': 'ru_syntagrus-ud-train.conllu',
         'test': 'ru_syntagrus-ud-test.conllu',
         'dev': 'ru_syntagrus-ud-dev.conllu'}

# topic name -> list of lower-cased sentences, accumulated over ALL splits.
# Fix: the dictionary used to be re-created after the loop header, so only the
# sentences of the last file listed in `files` survived the loop.
database = {}
for data_type in files:
    filename = files[data_type]
    with open(os.path.join('UD_Russian-SynTagRus-master', filename), encoding='utf-8') as f:
        parsed = parse_incr(f)
        for token_list in parsed:
            # The topic is the part of `sent_id` before the first '.',
            # with every run of digits removed.
            topic_name = token_list.metadata['sent_id'].split('.')[0]
            topic_name = re.sub(r'\d+', '', topic_name)
            # A sentence is its token forms joined by single spaces, lower-cased.
            sentence = ' '.join([token['form'] for token in token_list]).lower()
            database.setdefault(topic_name, []).append(sentence)
, .
# Topics held out for evaluation; every other topic goes to the training corpus.
choosen_for_evaluation = ['I_slepye_prozreyut',
                          'Interviyu_Mariny_Astvatsaturyan',
                          'Byudzhet']

# Partition `database` by topic name (insertion order of `database` is kept).
texts_for_evaluation = {name: sentences
                        for name, sentences in database.items()
                        if name in choosen_for_evaluation}
texts_for_training = {name: sentences
                      for name, sentences in database.items()
                      if name not in choosen_for_evaluation}

# Flat list of all training sentences, topic after topic.
TEXTS_CORPUS = []
for sentences in texts_for_training.values():
    TEXTS_CORPUS.extend(sentences)
#
# Preview of the held-out data: for each topic print its name, its sentence
# count and its first seven sentences (each cut to 100 characters).
for topic, topic_sentences in texts_for_evaluation.items():
    print(topic, len(topic_sentences))
    for sentence in topic_sentences[:7]:
        print('\t', sentence[:100])
    print('\n')
Byudzhet 70
. .
.
ii .
, " " .
,
, , … -
" " ,
Interviyu_Mariny_Astvatsaturyan 72
" " .
- , , ? ?
- , 1993- .
" "
, , -
.
" " ,
I_slepye_prozreyut 72
…
, ,
.
,
, , .
get_similarity_values, . .
: 3 70, 72 72 . , , . . .. , +214 , , -213 .. .
最高分 = sum(214 … 214−72) − sum(214−72 … 0) + 7626 = 10395
最低分 = −sum(214 … 72) + sum(72 … 0) + 7626 = −10053
# Fix both random number generators so runs are repeatable.
np.random.seed(42)
random.seed(777)

# Give every evaluation sentence a running integer id (topics in dictionary
# order, sentences in list order) and remember its topic and its text.
index2topic = {}
index2text = {}
_labelled_sentences = ((topic, sentence)
                       for topic, topic_sentences in texts_for_evaluation.items()
                       for sentence in topic_sentences)
for index, (topic, sentence) in enumerate(_labelled_sentences):
    index2topic[index] = topic
    index2text[index] = sentence
# As before, `index` ends up equal to the number of evaluation sentences.
index = len(index2text)
def get_similarity_values(sentences):
    """Random baseline: return an n x n matrix drawn by ``np.random.rand``.

    ``n`` is ``len(sentences)``; the sentence contents are not inspected.
    """
    n = len(sentences)
    return np.random.rand(n, n)
# method name -> (score, std of the per-sentence scores) for the bar chart.
chart_methods = {}
# Constant subtracted from every mean score. NOTE(review): presumably chosen so
# that the random baseline scores about 0 (the 'random arrange' run prints 0.0).
bottom_minimum = -7626.2336448598135


def evaluate(get_similarity_values, method_name=None, add_to_chart=True):
    """Score a similarity function on the evaluation sentences.

    ``get_similarity_values`` receives the list of evaluation texts (ordered
    by their integer id) and must return a matrix whose row ``i`` holds the
    distances from sentence ``i`` to every sentence.

    For each target sentence all sentences are ranked by ascending distance.
    The sentence at rank ``r`` (0-based) contributes ``+(N - r)`` when it has
    the target's topic and ``-(N - r)`` otherwise, ``N`` being the number of
    evaluation texts. The score is the mean over all targets minus
    ``bottom_minimum``, rounded to one decimal.

    When ``add_to_chart`` is true, ``(score, std)`` is stored in
    ``chart_methods[method_name]`` unless a higher score is already stored.

    Returns the string ``'<method_name>: <score>'``.
    """
    test_messages = [index2text[position] for position in range(len(index2text))]
    distances_each_to_each = get_similarity_values(test_messages)
    total = len(test_messages)
    candidate_count = len(index2topic)

    evaluations = []
    for target_index in index2topic:
        target_topic = index2topic[target_index]
        row = distances_each_to_each[target_index]
        # Stable sort by distance only: ties keep ascending candidate order.
        ranking = sorted(zip(range(candidate_count), row), key=lambda pair: pair[1])
        score = 0
        for rank, (candidate, _distance) in enumerate(ranking):
            weight = total - rank
            if index2topic[candidate] == target_topic:
                score += weight
            else:
                score -= weight
        evaluations.append(score)

    # Shift by the constant offset so results are reported relative to it.
    result = round(np.mean(evaluations) - bottom_minimum, 1)
    if add_to_chart:
        # Keep only the best score seen so far for each method name.
        stored = chart_methods.get(method_name)
        if stored is None or result > stored[0]:
            chart_methods[method_name] = (result, np.std(evaluations))
    return f'{method_name}: {str(result)}'
def parse_result(result):
    """Return the numeric score from a ``'<method name>: <score>'`` string.

    This is the format produced by ``evaluate``. The text after the last
    ``': '`` is converted, so a method name that itself contains ``': '``
    is handled.

    Raises:
        IndexError: if ``result`` contains no ``': '`` separator.
        ValueError: if the text after the separator is not a number.
    """
    # Fix: parse the argument. The old body read the module-level variable
    # `new_result` and ignored `result`, so it only worked by coincidence.
    return float(result.rsplit(': ', 1)[1])
# Score the random-matrix baseline and record it on the chart.
evaluate(get_similarity_values, method_name='random arrange')
'random arrange: 0.0'
1. BOW
1.1 BOW
, ( ).
# Learn the bag-of-words vocabulary from the raw training sentences.
corpus = TEXTS_CORPUS
count_vectorizer = CountVectorizer().fit(corpus)


def get_similarity_values(sentences):
    """Return pairwise cosine distances between bag-of-words count vectors."""
    bow = count_vectorizer.transform(sentences)
    return cosine_distances(bow, bow)


evaluate(get_similarity_values, 'BOW')
'BOW: 693.1'
1.2 BOW
, .
# Shared morphological analyzer used by `lemmatize`.
morph = pymorphy2.MorphAnalyzer()


def lemmatize(corpus, verbose=False):
    """Return `corpus` with every token replaced by its normal form.

    Each sentence is split on whitespace; every token is replaced by the
    ``normal_form`` of its first pymorphy2 parse, and the tokens are joined
    back with single spaces. With ``verbose`` a tqdm progress bar is shown.
    """
    sentences = tqdm(corpus, leave=False) if verbose else corpus
    return [
        ' '.join(morph.parse(token)[0].normal_form for token in sentence.split())
        for sentence in sentences
    ]
# Learn the bag-of-words vocabulary from the lemmatized training sentences.
corpus = lemmatize(TEXTS_CORPUS, True)
count_vectorizer = CountVectorizer().fit(corpus)


def get_similarity_values(sentences):
    """Return pairwise cosine distances between lemmatized count vectors."""
    lemmas = lemmatize(sentences)
    bow = count_vectorizer.transform(lemmas)
    return cosine_distances(bow, bow)


# Third positional argument is add_to_chart: this run is not charted.
evaluate(get_similarity_values, 'BOW ', False)
'BOW : 1645.8'
1.3 BOW
# NLTK's Russian stop-word list, extended with punctuation tokens.
ru_stopwords = stopwords.words('russian') + [
    '.', ',', '"', '!',
    '?', '(', ')', '-',
    ':', ';', '_', '\\',
]
def delete_stopwords(corpus, verbose=False):
    """Return `corpus` with every token listed in ``ru_stopwords`` removed.

    Each sentence is split on whitespace, tokens found in the module-level
    ``ru_stopwords`` are dropped, and the rest are joined with single spaces.
    A sentence made only of stop words becomes ``''``.

    Args:
        corpus: iterable of sentence strings.
        verbose: when true, wrap the iteration in a tqdm progress bar.

    Returns:
        A list with one cleaned sentence string per input sentence.
    """
    # Build the lookup set once per call: set membership is O(1), whereas the
    # old code scanned the whole stop-word list for every token.
    stop_tokens = set(ru_stopwords)
    iterator = tqdm(corpus, leave=False) if verbose else corpus
    return [
        ' '.join(token for token in sentence.split() if token not in stop_tokens)
        for sentence in iterator
    ]
# Learn the vocabulary from training sentences with stop words removed first
# and the remaining tokens lemmatized.
corpus = lemmatize(delete_stopwords(TEXTS_CORPUS), True)
count_vectorizer = CountVectorizer().fit(corpus)


def get_similarity_values(sentences):
    """Return pairwise cosine distances after stop-word removal and lemmatization."""
    cleaned = lemmatize(delete_stopwords(sentences))
    bow = count_vectorizer.transform(cleaned)
    return cosine_distances(bow, bow)


evaluate(get_similarity_values, 'BOW ')
'BOW : 1917.6'
1.4 LDA
def similarity_values_wrapper(lda, count_vectorizer, do_lemmatize=False, do_delete_stopwords=False):
    """Build a distance function based on LDA topic vectors.

    The returned ``get_similarity_values(sentences)`` optionally removes stop
    words, then optionally lemmatizes, turns the sentences into counts with
    ``count_vectorizer``, maps them through ``lda.transform`` and returns
    the pairwise cosine distances between the resulting vectors.
    """
    def get_similarity_values(sentences):
        prepared = delete_stopwords(sentences) if do_delete_stopwords else sentences
        if do_lemmatize:
            prepared = lemmatize(prepared)
        topic_vectors = lda.transform(count_vectorizer.transform(prepared))
        return cosine_distances(topic_vectors, topic_vectors)

    return get_similarity_values
# Three LDA runs that differ only in text preprocessing. Each entry is
# (lemmatize, remove stop words, method name, add_to_chart).
_lda_runs = [
    (False, False, 'LDA', False),
    (True, False, 'LDA ', True),
    (True, True, 'LDA ', False),
]
for _do_lemmatize, _do_delete_stopwords, _method_name, _to_chart in _lda_runs:
    lda = LatentDirichletAllocation(n_components=300)
    # Training text: stop words are removed before lemmatization when both apply.
    corpus = TEXTS_CORPUS
    if _do_delete_stopwords:
        corpus = delete_stopwords(corpus)
    if _do_lemmatize:
        corpus = lemmatize(corpus, True)
    count_vectorizer = CountVectorizer().fit(corpus)
    corpus = count_vectorizer.transform(corpus)
    lda.fit(corpus)
    get_similarity_values = similarity_values_wrapper(
        lda, count_vectorizer,
        do_lemmatize=_do_lemmatize, do_delete_stopwords=_do_delete_stopwords)
    print(evaluate(get_similarity_values, _method_name, _to_chart))
LDA: 344.7
LDA : 1092.1
LDA : 1077.2
%matplotlib inline
def plot_results():
    """Show a bar chart of every entry in ``chart_methods``.

    Methods are ordered by ascending score; each bar is the stored score and
    its error bar is the stored standard deviation.
    """
    ranked = sorted(chart_methods.items(), key=lambda item: item[1][0])
    labels, mean, std = [], [], []
    for method_name, stats in ranked:
        labels.append(method_name)
        mean.append(stats[0])
        std.append(stats[1])
    x_pos = np.arange(len(labels))

    # Build the plot
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.bar(x_pos, mean, yerr=std, align='center', alpha=0.5, ecolor='black', capsize=10)
    ax.set_ylabel(' ')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(labels, rotation=20, ha='right')
    ax.set_title(' ')
    ax.yaxis.grid(True)
    plt.show()


plot_results()
2. ,
.
#
import fasttext.util
from wikipedia2vec import Wikipedia2Vec
# Fetch the Russian fastText model.
# NOTE(review): if_exists='ignore' presumably skips the download when the file
# is already on disk - confirm against the fasttext.util documentation.
fasttext.util.download_model('ru', if_exists='ignore')
# Pre-trained Wikipedia2Vec model, loaded from a local file.
wiki2vec = Wikipedia2Vec.load('ruwiki_20180420_300d.pkl')
# fastText model, loaded from the local binary file.
ft = fasttext.load_model('cc.ru.300.bin')
2.1
def vectorize(token, use_word2vec=True, use_fasttext=True):
    """Return the embedding of one token as a 1-D numpy array.

    Args:
        token: the token string to embed.
        use_word2vec: include the vector from the ``wiki2vec`` model.
        use_fasttext: include the vector from the ``ft`` model.

    Returns:
        With both flags set, the word2vec vector followed by the fastText
        vector, concatenated; otherwise the single requested vector. A model
        lookup that raises ``KeyError`` contributes a zero vector.

    Raises:
        ValueError: if both flags are false. This used to be an ``assert``,
            which is stripped under ``python -O`` and then let the function
            fall through to return an error *string*.
    """
    if not (use_word2vec or use_fasttext):
        raise ValueError('at least one of use_word2vec / use_fasttext must be True')

    parts = []
    if use_word2vec:
        try:
            word2vec_vector = wiki2vec.get_word_vector(token)
        except KeyError:
            # Out-of-vocabulary token: zero vector whose size is taken from a
            # probe lookup. NOTE(review): this assumes 'the' is present in the
            # loaded model - confirm.
            word2vec_vector = np.zeros((len(wiki2vec.get_word_vector('the'))))
        parts.append(word2vec_vector)
    if use_fasttext:
        try:
            fast_text_vector = ft.get_word_vector(token)
        except KeyError:
            fast_text_vector = np.zeros((ft.get_dimension()))
        parts.append(fast_text_vector)

    if len(parts) == 1:
        return np.array(parts[0])
    return np.concatenate(parts)
# Sanity check: print the shape of the vector returned for a single token
# with the default flags (both models enabled).
print(np.shape(vectorize('any_token')))
def similarity_values_wrapper(use_word2vec=True, use_fasttext=True, distance_function=cosine_distances):
    """Build a distance function based on averaged token embeddings.

    Args:
        use_word2vec: forwarded to ``vectorize``.
        use_fasttext: forwarded to ``vectorize``.
        distance_function: callable ``f(X, X)`` returning the pairwise
            distance matrix.

    Returns:
        ``get_similarity_values(sentences)``, which represents each sentence
        by the mean of the vectors of its whitespace-separated tokens and
        returns ``distance_function`` applied to those sentence vectors.
    """
    def get_similarity_values(sentences):
        sent_vector = []
        for sentence in sentences:
            token_vectors = [vectorize(token, use_word2vec, use_fasttext)
                             for token in sentence.split()]
            if token_vectors:
                sent_vector.append(np.mean(token_vectors, axis=0))
            else:
                # A sentence without tokens used to yield np.mean([]) == nan
                # (a scalar), which made the input ragged and broke the
                # distance computation. Use a zero vector of the right size.
                sent_vector.append(
                    np.zeros(len(vectorize('any_token', use_word2vec, use_fasttext))))
        return distance_function(sent_vector, sent_vector)

    return get_similarity_values
# Every combination of distance function and embedding source. Each entry is
# (use_word2vec, use_fasttext, distance function, method name, add_to_chart).
_embedding_runs = [
    (True, True, euclidean_distances, ' embedings euclidean_distances word2vec + fast_text', False),
    (False, True, euclidean_distances, ' embedings euclidean_distances fast_text', False),
    (True, False, euclidean_distances, ' embedings euclidean_distances word2vec', True),
    (True, True, cosine_distances, ' embedings cosine_distance word2vec + fast_text', False),
    (False, True, cosine_distances, ' embedings cosine_distance fast_text', False),
    (True, False, cosine_distances, ' embedings cosine_distance word2vec', True),
]
for _use_w2v, _use_fasttext, _distance, _method_name, _to_chart in _embedding_runs:
    get_similarity_values = similarity_values_wrapper(
        use_word2vec=_use_w2v, use_fasttext=_use_fasttext, distance_function=_distance)
    print(evaluate(get_similarity_values, _method_name, add_to_chart=_to_chart))
embedings euclidean_distances word2vec + fast_text: 1833.6
embedings euclidean_distances fast_text: 913.5
embedings euclidean_distances word2vec: 1941.6
embedings cosine_distance word2vec + fast_text: 2278.1
c embedings cosine_distance fast_text: 829.2
embedings cosine_distance word2vec: 2437.7
2.2
def similarity_values_wrapper(use_word2vec=True, use_fasttext=True, distance_function=cosine_distances):
    """Build a distance function based on averaged embeddings without stop words.

    Args:
        use_word2vec: forwarded to ``vectorize``.
        use_fasttext: forwarded to ``vectorize``.
        distance_function: callable ``f(X, X)`` returning the pairwise
            distance matrix.

    Returns:
        ``get_similarity_values(sentences)``, which removes stop words with
        ``delete_stopwords``, represents each sentence by the mean of the
        vectors of its remaining tokens and returns ``distance_function``
        applied to those sentence vectors.
    """
    def get_similarity_values(sentences):
        sentences = delete_stopwords(sentences)
        sent_vector = []
        for sentence in sentences:
            token_vectors = [vectorize(token, use_word2vec, use_fasttext)
                             for token in sentence.split()]
            if token_vectors:
                sent_vector.append(np.mean(token_vectors, axis=0))
            else:
                # A sentence made only of stop words is empty here. It used to
                # yield np.mean([]) == nan (a scalar), which made the input
                # ragged and broke the distance computation. Use a zero vector.
                sent_vector.append(
                    np.zeros(len(vectorize('any_token', use_word2vec, use_fasttext))))
        return distance_function(sent_vector, sent_vector)

    return get_similarity_values
# Same six configurations, now with stop words removed by the wrapper. Each
# entry is (use_word2vec, use_fasttext, distance function, method name,
# add_to_chart); cosine_distances is also the wrapper's default.
_embedding_runs = [
    (True, True, euclidean_distances, ' embedings euclidean_distances word2vec + fast_text', False),
    (False, True, euclidean_distances, ' embedings euclidean_distances fast_text', False),
    (True, False, euclidean_distances, ' embedings euclidean_distances word2vec', True),
    (True, True, cosine_distances, ' embedings cosine_distance word2vec + fast_text', False),
    (False, True, cosine_distances, ' embedings cosine_distance fast_text', False),
    (True, False, cosine_distances, ' embedings cosine_distance word2vec', True),
]
for _use_w2v, _use_fasttext, _distance, _method_name, _to_chart in _embedding_runs:
    get_similarity_values = similarity_values_wrapper(
        use_word2vec=_use_w2v, use_fasttext=_use_fasttext, distance_function=_distance)
    print(evaluate(get_similarity_values, _method_name, add_to_chart=_to_chart))
embedings euclidean_distances word2vec + fast_text: 2116.9
embedings euclidean_distances fast_text: 1314.5
embedings euclidean_distances word2vec: 2159.1
embedings cosine_distance word2vec + fast_text: 2779.7
embedings cosine_distance fast_text: 2199.0
embedings cosine_distance word2vec: 2814.4
2.3 tf-idf
# Fit tf-idf weights on the raw training sentences.
tf_idf_vectorizer = TfidfVectorizer()
tf_idf_vectorizer.fit(TEXTS_CORPUS)
# vocab[column index] -> token. `get_feature_names()` was removed in
# scikit-learn 1.2; prefer `get_feature_names_out()` when it exists and
# convert the returned array to a list of plain `str` objects.
if hasattr(tf_idf_vectorizer, 'get_feature_names_out'):
    vocab = tf_idf_vectorizer.get_feature_names_out().tolist()
else:
    vocab = tf_idf_vectorizer.get_feature_names()
def similarity_values_wrapper(use_word2vec=True, use_fasttext=True, distance_function=cosine_distances):
    """Build a distance function based on tf-idf weighted embedding sums.

    Args:
        use_word2vec: forwarded to ``vectorize``.
        use_fasttext: forwarded to ``vectorize``.
        distance_function: callable ``f(X, X)`` returning the pairwise
            distance matrix.

    Returns:
        ``get_similarity_values(sentences)``, which represents each sentence
        by the sum over its tf-idf vocabulary tokens of
        ``weight * vectorize(token)`` and returns ``distance_function``
        applied to those sentence vectors. A sentence with no token in the
        tf-idf vocabulary is represented by the zero vector.
    """
    def get_similarity_values(sentences):
        weights_data = tf_idf_vectorizer.transform(sentences).tocoo()
        embedding_size = len(vectorize('any_token', use_word2vec, use_fasttext))
        # One accumulator row per sentence. The old `[[]] * n` made every row
        # alias the SAME list, so all sentences got identical vectors, and
        # the empty-row fallback appended an int to the outer list.
        sent_vector = np.zeros((len(sentences), embedding_size))
        for row, col, weight in zip(weights_data.row, weights_data.col, weights_data.data):
            # str(): pass a plain `str` even if the vocabulary holds numpy strings.
            sent_vector[row] += weight * vectorize(str(vocab[col]), use_word2vec, use_fasttext)
        return distance_function(sent_vector, sent_vector)

    return get_similarity_values
# tf-idf weighted embeddings: cosine runs first, then euclidean. Each entry is
# (use_word2vec, use_fasttext, distance function, method name, add_to_chart);
# cosine_distances is also the wrapper's default.
_tfidf_runs = [
    (True, True, cosine_distances, ' embedings tf-idf cosine_distance word2vec + fast_text', False),
    (False, True, cosine_distances, ' embedings tf-idf cosine_distance fast_text', False),
    (True, False, cosine_distances, ' embedings tf-idf cosine_distance word2vec', False),
    (True, True, euclidean_distances, ' embedings tf-idf euclidian_distance word2vec + fast_text', True),
    (False, True, euclidean_distances, ' embedings tf-idf euclidian_distance fast_text', False),
    (True, False, euclidean_distances, ' embedings tf-idf euclidian_distance word2vec', False),
]
for _use_w2v, _use_fasttext, _distance, _method_name, _to_chart in _tfidf_runs:
    get_similarity_values = similarity_values_wrapper(
        use_word2vec=_use_w2v, use_fasttext=_use_fasttext, distance_function=_distance)
    print(evaluate(get_similarity_values, _method_name, add_to_chart=_to_chart))
embedings tf-idf cosine_distance word2vec + fast_text: -133.6
embedings tf-idf cosine_distance fast_text: 9.0
embedings tf-idf cosine_distance word2vec: -133.6
embedings tf-idf euclidian_distance word2vec + fast_text: 6.4
embedings tf-idf euclidian_distance fast_text: -133.6
embedings tf-idf euclidian_distance word2vec: -133.6
# Redraw the chart with the methods recorded in `chart_methods` so far.
plot_results()
, .
# Longest and shortest token-sequence lengths.
max_len, min_len = 20, 5
# Length of the vector `vectorize` returns for one token with its default flags.
embedding_size = len(vectorize('any token'))
class EmbedingsDataGenerator():
    """Generate next-token training batches from a corpus of sentences.

    Iterating yields ``batches_per_epoch`` pairs ``(X, y)``: ``X`` holds the
    padded embedding sequences of sentence prefixes and ``y`` the embedding
    of the token that follows each prefix.
    """

    def __init__(self, texts_corpus=TEXTS_CORPUS, min_len=5, max_len=20, batch_size=32, batches_per_epoch=100, use_word2vec=True, use_fasttext=True):
        """Store the sampling settings.

        Args:
            texts_corpus: sequence of sentences to sample from.
            min_len: minimum prefix length; sentences with fewer tokens are skipped.
            max_len: ``maxlen`` passed to ``pad_sequences``.
            batch_size: number of (prefix, next token) samples per batch.
            batches_per_epoch: number of batches yielded by one iteration.
            use_word2vec: forwarded to the module-level ``vectorize``.
            use_fasttext: forwarded to the module-level ``vectorize``.
        """
        self.texts = texts_corpus
        self.min_len = min_len
        self.max_len = max_len
        self.batch_size = batch_size
        self.batches_per_epoch = batches_per_epoch
        self.use_word2vec = use_word2vec
        self.use_fasttext = use_fasttext
        # Size of one token vector for the selected embedding sources.
        self.embedding_size = len(vectorize('token', use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext))

    def vectorize(self, sentences):
        """Return the padded token-embedding sequences of `sentences`.

        Each sentence is converted with ``str`` and split on whitespace;
        every token is embedded with the module-level ``vectorize`` and the
        sequences are passed to ``pad_sequences`` with ``maxlen=self.max_len``.
        """
        vectorized_sentences = [
            [vectorize(token, use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext)
             for token in str(text).split()]
            for text in sentences
        ]
        return pad_sequences(vectorized_sentences, maxlen=self.max_len, dtype='float32')

    def __iter__(self):
        """Yield ``(X_batch, y_batch)`` numpy arrays, one pair per batch."""
        for _ in tqdm(range(self.batches_per_epoch), leave=False):
            X_batch = []
            y_batch = []
            finished_batch = False
            while not finished_batch:
                text = random.choice(self.texts)
                tokens = str(text).split()
                if len(tokens) < self.min_len:
                    continue
                x_vec = []
                for token in tokens:
                    token_vec = vectorize(token, use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext)
                    if len(x_vec) >= self.min_len:
                        # Store a snapshot of the prefix. Appending `x_vec`
                        # itself stored a reference to a list that keeps
                        # growing, so every saved prefix later contained its
                        # own target token (and the tokens after it).
                        X_batch.append(list(x_vec))
                        y_batch.append(token_vec)
                        if len(X_batch) == self.batch_size:
                            X_batch = pad_sequences(X_batch, maxlen=self.max_len, dtype='float32')
                            yield np.array(X_batch), np.array(y_batch)
                            finished_batch = True
                            break
                    x_vec.append(token_vec)
class IndexesDataGenerator(EmbedingsDataGenerator):
    """Generate next-token batches whose targets are one-hot token indexes.

    Iterating yields ``batches_per_epoch`` triples
    ``(X_embeddings, X_one_hot_indexes, y_one_hot)``.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the parent and number every distinct corpus token."""
        super().__init__(*args, **kwargs)
        # token -> integer id, assigned in order of first appearance.
        self.token2index = {}
        index = 0
        for text in self.texts:
            tokens = str(text).split()
            for token in tokens:
                if token not in self.token2index:
                    self.token2index[token] = index
                    index += 1

    def __iter__(self):
        """Yield ``(X_batch, X_batch_indexes, y_batch)`` numpy arrays per batch."""
        for _ in tqdm(range(self.batches_per_epoch), leave=False):
            X_batch = []
            X_batch_indexes = []
            y_batch = []
            finished_batch = False
            while not finished_batch:
                text = random.choice(self.texts)
                tokens = str(text).split()
                if len(tokens) < self.min_len:
                    continue
                x_vec = []
                x_tokens = []
                for token in tokens:
                    token_vec = vectorize(token, use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext)
                    if len(x_vec) >= self.min_len:
                        # Store a snapshot of the prefix. Appending `x_vec`
                        # itself stored a reference to a list that keeps
                        # growing, so every saved prefix later contained its
                        # own target token (and the tokens after it).
                        X_batch.append(list(x_vec))
                        X_batch_indexes.append(to_categorical(x_tokens, num_classes=len(self.token2index)))
                        y_batch.append(self.token2index[token])
                        if len(X_batch) == self.batch_size:
                            X_batch = pad_sequences(X_batch, maxlen=self.max_len, dtype='float32')
                            X_batch_indexes = pad_sequences(X_batch_indexes, maxlen=self.max_len, dtype='int32')
                            y_batch = to_categorical(y_batch, num_classes=len(self.token2index))
                            yield np.array(X_batch), np.array(X_batch_indexes), np.array(y_batch)
                            finished_batch = True
                            break
                    x_vec.append(token_vec)
                    x_tokens.append(self.token2index[token])
, , , 100 32, :
# Generator with all-default settings.
data_generator = EmbedingsDataGenerator()
%%timeit
# Time one full pass over the generator; the batches themselves are discarded.
for x, y in data_generator:
    pass
448 ms ± 65.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# Generator with all-default settings.
data_generator = IndexesDataGenerator()
%%timeit
# Time one full pass over the generator; the batches themselves are discarded.
for x_e, x_i, y_i in data_generator:
    pass
5.77 s ± 115 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
3. Language Models
. , .
: 20 5.
def similarity_values_wrapper(embedder, vectorizer, distance_function=cosine_distances):
    """Build a distance function from a vectorizer and an embedder.

    The returned ``get_similarity_values(sentences)`` applies ``vectorizer``
    to the sentences, passes the result to ``embedder`` and returns
    ``distance_function`` of the embeddings against themselves.
    """
    def get_similarity_values(sentences):
        embeddings = embedder(vectorizer(sentences))
        return distance_function(embeddings, embeddings)

    return get_similarity_values
3.1 Language Model on embeddings
def model_builder(data_generator):
    """Build and compile the model that predicts the next token's embedding.

    Input shape is ``(data_generator.max_len, data_generator.embedding_size)``.
    The network is two stacked LSTMs, two ``elu`` dense layers and a linear
    output of ``embedding_size`` units, compiled with the
    ``cosine_similarity`` loss and the ``adam`` optimizer. The model summary
    is printed before the model is returned.
    """
    complexity = 500  # width of every hidden layer
    inp = Input(shape=(data_generator.max_len, data_generator.embedding_size))
    hidden = LSTM(complexity, return_sequences=True)(inp)
    hidden = LSTM(complexity)(hidden)
    for _ in range(2):
        hidden = Dense(complexity, activation='elu')(hidden)
    prediction = Dense(data_generator.embedding_size, activation='linear')(hidden)

    model = Model(inputs=inp, outputs=prediction)
    model.compile(loss=cosine_similarity, optimizer='adam')
    model.summary()
    return model
# Train the next-token-embedding model on word2vec vectors only, scoring it
# on the evaluation sentences every third epoch.
data_generator = EmbedingsDataGenerator(use_fasttext=False)
next_word_model = model_builder(data_generator)
# Sentence embedding = the model's prediction for the padded sentence.
get_similarity_values = similarity_values_wrapper(next_word_model.predict, data_generator.vectorize)
# Very low starting score (-10e5 == -1,000,000).
new_result = -10e5
for i in tqdm(range(1000)):
    if i%3==0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, 'Language Model on embedings')
        new_result = parse_result(new_result)
        print(i, new_result)
        # Stopping condition: after epoch 20, stop once the score drops below
        # the previous evaluation.
        # NOTE(review): the original indentation was lost. The check is placed
        # inside the every-third-epoch branch; scores only change there, so
        # placing it one level out would behave the same.
        if new_result < previous_result and i > 20:
            break
    # One epoch: train on every batch the generator yields.
    for x, y in data_generator:
        next_word_model.train_on_batch(x, y)
0 1644.6
3 148.7
6 274.8
9 72.3
12 186.8
15 183.7
18 415.8
21 138.9
3.2 Language Model on token index
def model_builder(data_generator):
    """Build the next-token classifier and the sentence embedder sharing its layers.

    Input shape is ``(data_generator.max_len, data_generator.embedding_size)``.
    The classifier is two stacked LSTMs, a linear dense layer named
    ``'embedding_output'``, an ``elu`` dense layer and a softmax over
    ``len(data_generator.token2index)`` classes, compiled with categorical
    cross-entropy, ``adam`` and the ``acc`` metric. Its summary is printed.

    Returns:
        ``(model, embedder)``, where ``embedder`` maps the same input to the
        output of the ``'embedding_output'`` layer.
    """
    complexity = 200  # width of every hidden layer
    inp = Input(shape=(data_generator.max_len, data_generator.embedding_size))
    hidden = LSTM(complexity, return_sequences=True)(inp)
    hidden = LSTM(complexity)(hidden)
    hidden = Dense(complexity, activation='linear', name='embedding_output')(hidden)
    hidden = Dense(complexity, activation='elu')(hidden)
    probabilities = Dense(len(data_generator.token2index), activation='softmax')(hidden)

    model = Model(inputs=inp, outputs=probabilities)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()

    embedding_layer = model.get_layer('embedding_output')
    embedder = Model(inputs=inp, outputs=embedding_layer.output)
    return model, embedder
# Train the next-token classifier, scoring its embedder on the evaluation
# sentences every third epoch.
data_generator = IndexesDataGenerator()
next_word_model, embedder = model_builder(data_generator)
# Sentence embedding = output of the 'embedding_output' layer.
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
# Very low starting score (-10e5 == -1,000,000).
new_result = -10e5
for i in tqdm(range(1000)):
    if i%3==0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, 'Language Model on token index')
        new_result = parse_result(new_result)
        print(i, new_result)
        # Stopping condition: after epoch 20, stop once the score drops below
        # the previous evaluation.
        # NOTE(review): the original indentation was lost. The check is placed
        # inside the every-third-epoch branch; scores only change there, so
        # placing it one level out would behave the same.
        if new_result < previous_result and i > 20:
            break
    # One epoch: only the embedding inputs and the one-hot targets are used;
    # the one-hot index inputs (x_i) are ignored.
    for x_e, x_i, y in data_generator:
        next_word_model.train_on_batch(x_e, y)
0 1700.6
3 404.7
6 255.3
9 379.8
12 195.2
15 160.1
18 530.7
21 701.9
24 536.9
# Redraw the chart with the methods recorded in `chart_methods` so far.
plot_results()