Hello! This is a continuation of the article on methods for creating sentence embeddings. The guide contains very few words and a lot of code, ready for Ctrl+C, Ctrl+V, improvement, and further experiments.
Part one is required reading.
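The evaluation harness from part one is reused everywhere below. For readers who skipped it, here is a minimal, hypothetical sketch of one of its interfaces, as it can be inferred from the calls in this article (the real similarity_values_wrapper, evaluate, parse_result, plot_results, vectorize and the data-generator classes are all defined in part one):
from sklearn.metrics.pairwise import cosine_distances

def similarity_values_wrapper(predict, vectorizer, distance_function=cosine_distances):
    # hypothetical reconstruction: embed both sentence lists with the supplied
    # model and score pairs by negated distance, so larger means "more similar"
    def get_similarity_values(sentences_a, sentences_b):
        embeddings_a = predict(vectorizer(sentences_a))
        embeddings_b = predict(vectorizer(sentences_b))
        return -distance_function(embeddings_a, embeddings_b)
    return get_similarity_values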
4. BERT and ELMo
import os  # used below to point the config at the unpacked model

from deeppavlov.core.common.file import read_json
from deeppavlov import build_model, configs
from deeppavlov.models.embedders.elmo_embedder import ELMoEmbedder
# http://docs.deeppavlov.ai/en/master/features/pretrained_vectors.html
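The BERT configs below expect the chosen model to be unpacked next to the notebook. A minimal sketch of a downloader, assuming the archives follow the files.deeppavlov.ai naming pattern used for ELMo later in this article (fetch_model is a hypothetical helper; check the exact archive URLs on the docs page above):
import tarfile
import urllib.request

def fetch_model(name, base_url='http://files.deeppavlov.ai/deeppavlov_data/bert/'):
    # download and unpack <name>.tar.gz into the current directory, if missing
    # (base_url is an assumption; verify it against the DeepPavlov docs)
    archive = f'{name}.tar.gz'
    if not os.path.isdir(name):
        urllib.request.urlretrieve(base_url + archive, archive)
        with tarfile.open(archive) as tar:
            tar.extractall('.')

fetch_model('rubert_cased_L-12_H-768_A-12_pt')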
4.1 rubert_cased_L-12_H-768_A-12_pt
class RU_BERT_CLASS:
    def __init__(self, name):
        # repoint the stock bert_embedder config at the locally unpacked model
        bert_config = read_json(configs.embedder.bert_embedder)
        bert_config['metadata']['variables']['BERT_PATH'] = os.path.join('./.', name)
        self.m = build_model(bert_config)

    def vectorizer(self, sentences):
        # the embedder expects pre-tokenized input
        return [sentence.split() for sentence in sentences]

    def predict(self, tokens):
        # the model returns several representations; keep the mean of token embeddings
        _, _, _, _, sent_max_embs, sent_mean_embs, _ = self.m(tokens)
        return sent_mean_embs
bert = RU_BERT_CLASS('rubert_cased_L-12_H-768_A-12_pt')
get_similarity_values = similarity_values_wrapper(bert.predict, bert.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'rubert')
'rubert: 2895.7'
4.2 ru_conversational_cased_L-12_H-768_A-12_pt
bert = RU_BERT_CLASS('ru_conversational_cased_L-12_H-768_A-12_pt')
get_similarity_values = similarity_values_wrapper(bert.predict, bert.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'ru_conversational')
'ru_conversational:3559.1'
4.3 sentence_ru_cased_L-12_H-768_A-12_pt
bert = RU_BERT_CLASS('sentence_ru_cased_L-12_H-768_A-12_pt')
get_similarity_values = similarity_values_wrapper(bert.predict, bert.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'sentence_ru')
'sentence_ru:2660.2'
4.4 elmo_ru-news_wmt11-16_1.5M_steps
class ELMO_CLASS(RU_BERT_CLASS):
    def __init__(self, name):
        # the embedder is pointed straight at the model archive URL
        self.m = ELMoEmbedder(f"http://files.deeppavlov.ai/deeppavlov_data/{name}")

    def predict(self, tokens):
        return self.m(tokens)
elmo = ELMO_CLASS('elmo_ru-news_wmt11-16_1.5M_steps.tar.gz')
get_similarity_values = similarity_values_wrapper(elmo.predict, elmo.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'elmo_ru-news')
'elmo_ru-news: 4631.3'
4.5 elmo_ru-wiki_600k_steps
elmo = ELMO_CLASS('elmo_ru-wiki_600k_steps.tar.gz')
get_similarity_values = similarity_values_wrapper(elmo.predict, elmo.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'elmo_ru-wiki')
'elmo_ru-wiki: 4507.6'
4.6 elmo_ru-twitter_2013-01_2018-04_600k_steps
elmo = ELMO_CLASS('elmo_ru-twitter_2013-01_2018-04_600k_steps.tar.gz')
get_similarity_values = similarity_values_wrapper(elmo.predict, elmo.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'elmo_ru-twitter')
'elmo_ru-twitter: 2962.2'
plot_results()
5. Autoencoders
The idea here: train the network to reconstruct its own input and take the narrow middle layer ('embeding_output') as the sentence embedding.
5.1 embeddings -> embeddings
def models_builder(data_generator):
    def cosine_loss(y_true, y_pred):
        return K.mean(cosine_similarity(y_true, y_pred, axis=-1))
    complexity = 300
    inp = Input(shape=(data_generator.max_len, data_generator.embedding_size))
    X = inp
    # encoder
    X = Bidirectional(LSTM(complexity, return_sequences=True))(X)
    X = Bidirectional(LSTM(int(complexity/10), return_sequences=True))(X)
    X = Flatten()(X)
    X = Dense(complexity, activation='elu')(X)
    X = Dense(complexity, activation='elu')(X)
    # bottleneck: this layer's output is the sentence embedding
    X = Dense(complexity, activation='linear', name='embeding_output')(X)
    # decoder
    X = Dense(complexity, activation='elu')(X)
    X = Dense(data_generator.max_len*complexity, activation='elu')(X)
    X = Reshape((data_generator.max_len, complexity))(X)
    X = Bidirectional(LSTM(complexity, return_sequences=True))(X)
    X = Bidirectional(LSTM(complexity, return_sequences=True))(X)
    X = Dense(data_generator.embedding_size, activation='elu')(X)
    autoencoder = Model(inputs=inp, outputs=X)
    autoencoder.compile(loss=cosine_loss, optimizer='adam')
    autoencoder.summary()
    embedder = Model(inputs=inp, outputs=autoencoder.get_layer('embeding_output').output)
    return autoencoder, embedder
data_generator = EmbedingsDataGenerator(use_fasttext=False)
autoencoder, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize, distance_function=cosine_distances)
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        # evaluate every third epoch; stop once the score stops improving
        previous_result = new_result
        new_result = evaluate(get_similarity_values, ' embedings -> embedings')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x, y in data_generator:
        autoencoder.train_on_batch(x, x)
0 1770.2
3 212.6
6 138.8
9 84.8
12 78.1
15 106.4
18 112.7
21 79.7
5.2 embeddings -> indexes
def models_builder(data_generator):
    complexity = 300
    inp = Input(shape=(data_generator.max_len, data_generator.embedding_size))
    X = inp
    X = Bidirectional(LSTM(complexity, return_sequences=True))(X)
    X = Bidirectional(LSTM(int(complexity/10), return_sequences=True))(X)
    X = Flatten()(X)
    X = Dense(complexity, activation='elu')(X)
    X = Dense(complexity, activation='elu')(X)
    X = Dense(complexity, activation='linear', name='embeding_output')(X)
    X = Dense(complexity, activation='elu')(X)
    X = Dense(data_generator.max_len*complexity, activation='elu')(X)
    X = Reshape((data_generator.max_len, complexity))(X)
    X = Bidirectional(LSTM(complexity, return_sequences=True))(X)
    X = Bidirectional(LSTM(complexity, return_sequences=True))(X)
    X = Dense(len(data_generator.token2index), activation='softmax')(X)
    autoencoder = Model(inputs=inp, outputs=X)
    autoencoder.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    autoencoder.summary()
    embedder = Model(inputs=inp, outputs=autoencoder.get_layer('embeding_output').output)
    return autoencoder, embedder
data_generator = IndexesDataGenerator()
autoencoder, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, ' embedings -> indexes')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x_e, x_i, y_i in data_generator:
        autoencoder.train_on_batch(x_e, x_i)
0 1352.9
3 43.6
6 41.7
9 8.1
12 -5.6
15 43.1
18 36.1
21 -3.7
5.3 LSTM -> LSTM
def models_builder(data_generator):
    def cosine_loss(y_true, y_pred):
        return K.mean(cosine_similarity(y_true, y_pred, axis=-1))
    complexity = 300
    # encoder: the final LSTM states are compressed into the sentence embedding
    inp = Input(shape=(data_generator.max_len, data_generator.embedding_size))
    X = inp
    X, state_h, state_c = LSTM(complexity, return_state=True)(X)
    X = Concatenate()([state_h, state_c])
    X = Dense(complexity, activation='linear', name='embeding_output')(X)
    # decoder: an LSTM fed zeros, with its initial state derived from the embedding
    state_c = Dense(complexity, activation='linear')(X)
    state_h = Dense(complexity, activation='linear')(X)
    inp_zeros = Input(shape=(data_generator.max_len, data_generator.embedding_size))
    X = LSTM(complexity, return_sequences=True)(inp_zeros, initial_state=[state_c, state_h])
    X = Dense(data_generator.embedding_size, activation='linear')(X)
    autoencoder = Model(inputs=[inp, inp_zeros], outputs=X)
    autoencoder.compile(loss=cosine_loss, optimizer='adam')
    autoencoder.summary()
    embedder = Model(inputs=inp, outputs=autoencoder.get_layer('embeding_output').output)
    return autoencoder, embedder
data_generator = EmbedingsDataGenerator(use_fasttext=False)
autoencoder, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
zeros = np.zeros((data_generator.batch_size, data_generator.max_len, data_generator.embedding_size))
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, ' LSTM -> LSTM')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x, y in data_generator:
        autoencoder.train_on_batch([x, zeros], x)
0 1903.6
3 1299.3
6 313.5
9 445.3
12 454.9
15 447.7
18 454.5
21 448.1
5.4 LSTM -> LSTM -> indexes
def models_builder(data_generator):
    complexity = 300
    inp = Input(shape=(data_generator.max_len, data_generator.embedding_size))
    X = inp
    X, state_h, state_c = LSTM(complexity, return_state=True)(X)
    X = Concatenate()([state_h, state_c])
    X = Dense(complexity, activation='linear', name='embeding_output')(X)
    state_c = Dense(complexity, activation='linear')(X)
    state_h = Dense(complexity, activation='linear')(X)
    inp_zeros = Input(shape=(data_generator.max_len, data_generator.embedding_size))
    X = LSTM(complexity, return_sequences=True)(inp_zeros, initial_state=[state_c, state_h])
    X = Dense(len(data_generator.token2index), activation='softmax')(X)
    autoencoder = Model(inputs=[inp, inp_zeros], outputs=X)
    autoencoder.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    autoencoder.summary()
    embedder = Model(inputs=inp, outputs=autoencoder.get_layer('embeding_output').output)
    return autoencoder, embedder
data_generator = IndexesDataGenerator()
autoencoder, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
zeros = np.zeros((data_generator.batch_size, data_generator.max_len, data_generator.embedding_size))
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, ' LSTM -> LSTM -> indexes')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x_e, x_i, y_i in data_generator:
        autoencoder.train_on_batch([x_e, zeros], x_i)
0 1903.6
3 1483.3
6 1249.3
9 566.3
12 789.2
15 702.3
18 480.5
21 552.3
24 533.0
6. Transfer Learning
The idea: train a classifier to predict a text's topic, then reuse an intermediate layer of that classifier as the sentence embedding (texts_for_training, a topic -> sentences mapping, is assumed from part one).
TEXTS_CORPUS_WITH_LABEL = [(sentence, topic) for topic in texts_for_training for sentence in texts_for_training[topic]]
class BowDataGenerator(EmbedingsDataGenerator):
    def __init__(self, texts_topics=TEXTS_CORPUS_WITH_LABEL, batch_size=128, batches_per_epoch=100):
        self.texts_topics = texts_topics
        self.topic2index = {topic: index for index, topic in enumerate({topic for text, topic in self.texts_topics})}
        self.batch_size = batch_size
        self.batches_per_epoch = batches_per_epoch
        self.count_vectorizer = CountVectorizer().fit([text_topic[0] for text_topic in self.texts_topics])
        # weight classes inversely to their frequency
        counts = Counter([text_topic[1] for text_topic in self.texts_topics])
        self.class_weight = {self.topic2index[intent_id]: 1/counts[intent_id] for intent_id in counts}

    def vectorize(self, sentences):
        return self.count_vectorizer.transform(sentences).toarray()

    def __iter__(self):
        for _ in tqdm(range(self.batches_per_epoch), leave=False):
            X_batch = []
            y_batch = []
            finished_batch = False
            while not finished_batch:
                text, topic = random.choice(self.texts_topics)
                X_batch.append(text)
                y_batch.append(self.topic2index[topic])
                if len(X_batch) >= self.batch_size:
                    X_batch = self.count_vectorizer.transform(X_batch).toarray()
                    y_batch = to_categorical(y_batch, num_classes=len(self.topic2index))
                    yield np.array(X_batch), np.array(y_batch)
                    finished_batch = True
data_generator = BowDataGenerator()
6.1 BOW
def models_builder(data_generator):
    complexity = 500
    inp = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),))
    X = inp
    X = Dense(complexity)(X)
    X = Activation('elu')(X)
    X = Dense(complexity)(X)
    X = Activation('elu')(X)
    X = Dense(complexity, name='embeding_output')(X)
    X = Activation('elu')(X)
    X = Dense(len(data_generator.topic2index), activation='softmax')(X)
    model = Model(inputs=inp, outputs=X)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()
    embedder = Model(inputs=inp, outputs=model.get_layer('embeding_output').output)
    return model, embedder
data_generator = BowDataGenerator()
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, ' BOW')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x, y in data_generator:
        model.train_on_batch(x, y, class_weight=data_generator.class_weight)
0 601.4
3 1175.4
6 1187.0
9 1175.9
12 1097.9
15 1083.4
18 1083.8
21 1060.5
6.2 LSTM + MaxPooling (InferSent)
class LabelsDataGenerator(EmbedingsDataGenerator):
    def __init__(self, texts_topics=TEXTS_CORPUS_WITH_LABEL, target_len=20, batch_size=128, batches_per_epoch=100, use_word2vec=True, use_fasttext=True):
        self.texts_topics = texts_topics
        self.topic2index = {topic: index for index, topic in enumerate({topic for text, topic in self.texts_topics})}
        self.target_len = target_len
        self.batch_size = batch_size
        self.batches_per_epoch = batches_per_epoch
        self.use_word2vec = use_word2vec
        self.use_fasttext = use_fasttext
        self.embedding_size = len(vectorize('token', use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext))
        counts = Counter([text_topic[1] for text_topic in self.texts_topics])
        self.class_weight = {self.topic2index[intent_id]: 1/counts[intent_id] for intent_id in counts}

    def vectorize(self, sentences):
        vectorized = []
        for text in sentences:
            tokens = str(text).split()
            x_vec = []
            for token in tokens:
                token_vec = vectorize(token, use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext)
                x_vec.append(token_vec)
            vectorized.append(x_vec)
        vectorized = pad_sequences(vectorized, maxlen=self.target_len)
        return vectorized

    def __iter__(self):
        for _ in tqdm(range(self.batches_per_epoch), leave=False):
            X_batch = []
            y_batch = []
            finished_batch = False
            while not finished_batch:
                text, topic = random.choice(self.texts_topics)
                tokens = text.split()
                x_vec = []
                for token in tokens:
                    token_vec = vectorize(token, use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext)
                    if len(x_vec) >= self.target_len:
                        # the text is longer than target_len: emit the window collected so far
                        X_batch.append(x_vec)
                        y_batch.append(self.topic2index[topic])
                        if len(X_batch) >= self.batch_size:
                            break
                    x_vec.append(token_vec)
                else:
                    # for-else: no break happened, so emit the whole text as one sample
                    X_batch.append(x_vec)
                    y_batch.append(self.topic2index[topic])
                if len(X_batch) >= self.batch_size:
                    X_batch = pad_sequences(X_batch, maxlen=self.target_len)
                    y_batch = to_categorical(y_batch, num_classes=len(self.topic2index))
                    yield np.array(X_batch), np.array(y_batch)
                    finished_batch = True
def models_builder(data_generator):
    complexity = 768
    inp = Input(shape=(data_generator.target_len, data_generator.embedding_size))
    X = inp
    X = Bidirectional(LSTM(complexity, return_sequences=True))(X)
    # swap time/feature axes, then max-pool across the BiLSTM features in windows of 600
    X = Permute((2, 1))(X)
    X = MaxPooling1D(pool_size=600)(X)
    X = Flatten()(X)
    X = Dense(complexity)(X)
    X = Activation('elu')(X)
    X = Dense(complexity, name='embeding_output')(X)
    X = Activation('sigmoid')(X)
    X = Dense(len(data_generator.topic2index), activation='softmax')(X)
    model = Model(inputs=inp, outputs=X)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()
    embedder = Model(inputs=inp, outputs=model.get_layer('embeding_output').output)
    return model, embedder
data_generator = LabelsDataGenerator()
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, ' LSTM + MaxPooling')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x, y in data_generator:
        model.train_on_batch(x, y, class_weight=data_generator.class_weight)
0 87.0
3 152.1
6 110.5
9 146.7
12 166.2
15 79.8
18 47.2
21 84.0
24 144.8
27 83.8
6.3 LSTM + Conv1D + AveragePooling
def models_builder(data_generator):
    complexity = 600
    inp = Input(shape=(data_generator.target_len, data_generator.embedding_size))
    # recurrent branch
    X_R = inp
    X_R = Bidirectional(LSTM(complexity, return_sequences=True))(X_R)
    X_R = Bidirectional(LSTM(complexity, return_sequences=True))(X_R)
    # convolutional branch
    X_C = inp
    X_C = Conv1D(complexity, 3, strides=1, padding='same')(X_C)
    X_C = Conv1D(complexity, 3, strides=1, padding='same')(X_C)
    X = Concatenate()([X_R, X_C])
    X = AveragePooling1D(pool_size=2)(X)
    X = Conv1D(complexity, 3, strides=1, padding='same')(X)
    X = AveragePooling1D(pool_size=2)(X)
    X = Conv1D(complexity, 3, strides=1, padding='same')(X)
    X = AveragePooling1D(pool_size=2)(X)
    X = Flatten()(X)
    X = Dense(complexity)(X)
    X = Activation('sigmoid')(X)
    X = Dense(complexity, name='embeding_output')(X)
    X = Activation('elu')(X)
    X = Dense(len(data_generator.topic2index), activation='softmax')(X)
    model = Model(inputs=inp, outputs=X)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()
    embedder = Model(inputs=inp, outputs=model.get_layer('embeding_output').output)
    return model, embedder
data_generator = LabelsDataGenerator()
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
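For completeness, a training loop in the style of the previous sections (a sketch; the evaluate() label is assumed from the section title):
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, ' LSTM + Conv1D + AveragePooling')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x, y in data_generator:
        model.train_on_batch(x, y, class_weight=data_generator.class_weight)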
0 353.8
3 -147.8
6 7.6
9 5.5
12 -133.6
15 -133.6
18 9.0
21 9.0
24 -133.6
6.4 LSTM + Inception + Attention
def models_builder(data_generator):
    rate = 0.20
    complexity = 500

    def inception_convolutional_layer(X, complexity, rate=0.2, regularizer=0):
        # parallel convolutions with kernel sizes 1..7, concatenated (Inception-style)
        X_7 = Conv1D(int(complexity/7), kernel_size=7, strides=1, padding='same')(X)
        X_6 = Conv1D(int(complexity/6), kernel_size=6, strides=1, padding='same')(X)
        X_5 = Conv1D(int(complexity/5), kernel_size=5, strides=1, padding='same')(X)
        X_4 = Conv1D(int(complexity/4), kernel_size=4, strides=1, padding='same')(X)
        X_3 = Conv1D(int(complexity/3), kernel_size=3, strides=1, padding='same')(X)
        X_2 = Conv1D(int(complexity/2), kernel_size=2, strides=1, padding='same')(X)
        X_1 = Conv1D(int(complexity/1), kernel_size=1, strides=1, padding='same')(X)
        X = Concatenate()([X_7, X_6, X_5, X_4, X_3, X_2, X_1])
        X = Activation('elu')(X)
        X = BatchNormalization()(X)
        X = Dropout(rate)(X)
        return X

    def bi_LSTM(X, complexity, rate=0.2, regularizer=0):
        X = Bidirectional(LSTM(int(complexity/2), return_sequences=True))(X)
        X = BatchNormalization()(X)
        X = Dropout(rate)(X)
        return X

    def dense_layer(X, complexity, activation='elu', rate=0.2, regularizer=0, name=None):
        X = Dense(int(complexity), name=name)(X)
        X = Activation(activation)(X)
        X = BatchNormalization()(X)
        X = Dropout(rate)(X)
        return X

    inp = Input(shape=(data_generator.target_len, data_generator.embedding_size))
    # convolutional tower
    X = inp
    X = inception_convolutional_layer(X, complexity)
    X = inception_convolutional_layer(X, complexity)
    X = inception_convolutional_layer(X, complexity)
    X = MaxPooling1D(pool_size=2)(X)
    X = inception_convolutional_layer(X, complexity)
    X = MaxPooling1D(pool_size=2)(X)
    X = inception_convolutional_layer(X, complexity)
    X = MaxPooling1D(pool_size=2)(X)
    # recurrent tower with multiplicative attention
    R = inp
    R = bi_LSTM(R, complexity)
    R = bi_LSTM(R, complexity/2)
    attention_probs = Dense(int(complexity/2), activation='sigmoid', name='attention_probs')(R)
    R = multiply([R, attention_probs], name='attention_mul')
    R = Dropout(rate)(R)
    R = MaxPooling1D(pool_size=2)(R)
    R = inception_convolutional_layer(R, complexity)
    R = MaxPooling1D(pool_size=2)(R)
    R = inception_convolutional_layer(R, complexity)
    R = MaxPooling1D(pool_size=2)(R)
    X = Concatenate(axis=-1)([X, R])
    X = Flatten()(X)
    X = BatchNormalization()(X)
    X = Dropout(rate)(X)
    X = dense_layer(X, complexity)
    X = dense_layer(X, complexity, activation='sigmoid')
    X = dense_layer(X, complexity, name='embeding_output')
    X = Dense(len(data_generator.topic2index), activation='softmax')(X)
    model = Model(inputs=inp, outputs=X)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()
    embedder = Model(inputs=inp, outputs=model.get_layer('embeding_output').output)
    return model, embedder
data_generator = LabelsDataGenerator()
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, ' LSTM + Inception + Attention')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x, y in data_generator:
        model.train_on_batch(x, y, class_weight=data_generator.class_weight)
0 275.0
3 126.8
6 173.9
9 155.5
12 168.4
15 287.2
18 382.8
21 303.4
plot_results()
7. Triplet loss
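Both variants below train a Siamese encoder with the "lossless" triplet loss (see the docstring in the code). For anchor, positive and negative embeddings a, p, n of dimension N, squashed into [0, 1] by a sigmoid, with squared distances d⁺ = ||a − p||² and d⁻ = ||a − n||², the code computes

L = −ln(1 + ε − d⁺/β) − ln(1 + ε − (N − d⁻)/β)

Unlike the plain margin-based triplet loss, this keeps a non-zero gradient even after the margin is satisfied; β is a scaling factor (N is the recommended value) and ε guards against ln(0).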
7.1 Triplet loss BOW
class TripletDataGeneratorIndexes(BowDataGenerator):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.database = {}
        for text, topic in self.texts_topics:
            if topic not in self.database:
                self.database[topic] = []
            self.database[topic].append(text)
        # keep only topics with more than 5 texts, so positives can be sampled
        sh_database = {}
        for topic in self.database:
            if len(self.database[topic]) > 5:
                sh_database[topic] = self.database[topic]
        self.database = sh_database
        self.all_topics = [topic for topic in self.database]

    def __iter__(self):
        for _ in tqdm(range(self.batches_per_epoch), leave=False):
            anchor = []
            positive = []
            negative = []
            for _ in range(self.batch_size):
                # positive: a different text from the anchor's topic
                anchor_topic = random.choice(self.all_topics)
                anchor_index = np.random.randint(len(self.database[anchor_topic]))
                positive_index = np.random.randint(len(self.database[anchor_topic]))
                while positive_index == anchor_index:
                    positive_index = np.random.randint(len(self.database[anchor_topic]))
                # negative: a text from a different topic
                negative_topic = random.choice(self.all_topics)
                while negative_topic == anchor_topic:
                    negative_topic = random.choice(self.all_topics)
                negative_index = np.random.randint(len(self.database[negative_topic]))
                anchor.append(self.database[anchor_topic][anchor_index])
                positive.append(self.database[anchor_topic][positive_index])
                negative.append(self.database[negative_topic][negative_index])
            yield self.vectorize(anchor), self.vectorize(positive), self.vectorize(negative)
def models_builder(data_generator):
    sentence_embeding_size = 100

    def lossless_triplet_loss(y_true, y_pred, N=sentence_embeding_size, beta=100, epsilon=1e-8):
        """
        Implementation of the triplet loss function
        Arguments:
        y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.
        y_pred -- python list containing three objects:
            anchor -- the encodings for the anchor data
            positive -- the encodings for the positive data (similar to anchor)
            negative -- the encodings for the negative data (different from anchor)
        N -- The number of dimension
        beta -- The scaling factor, N is recommended
        epsilon -- The Epsilon value to prevent ln(0)
        Returns:
        loss -- real number, value of the loss
        """
        anchor = tf.convert_to_tensor(y_pred[:, 0:N])
        positive = tf.convert_to_tensor(y_pred[:, N:N*2])
        negative = tf.convert_to_tensor(y_pred[:, N*2:N*3])
        # distance between the anchor and the positive
        pos_dist = tf.reduce_sum(tf.square(tf.subtract(anchor, positive)), 1)
        # distance between the anchor and the negative
        neg_dist = tf.reduce_sum(tf.square(tf.subtract(anchor, negative)), 1)
        # non-linear values
        pos_dist = -tf.math.log(-tf.math.divide((pos_dist), beta) + 1 + epsilon)
        neg_dist = -tf.math.log(-tf.math.divide((N - neg_dist), beta) + 1 + epsilon)
        # compute loss
        loss = neg_dist + pos_dist
        return loss

    def basic_sentence_vectorizer():
        inp = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),))
        X = inp
        X = Dense(complexity)(X)
        X = Activation('elu')(X)
        X = Dense(complexity)(X)
        X = Activation('elu')(X)
        X = Dense(complexity, name='embeding_output')(X)
        X = Activation('elu')(X)
        # final embedding: size N, squashed to [0, 1] as the lossless triplet loss expects
        X = Dense(sentence_embeding_size, activation='sigmoid')(X)
        vectorizer = Model(inputs=inp, outputs=X)
        return vectorizer

    complexity = 300
    inp_anchor = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),))
    inp_positive = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),))
    inp_negative = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),))
    # one shared encoder for all three branches (Siamese setup)
    embedder = basic_sentence_vectorizer()
    anchor = embedder(inp_anchor)
    positive = embedder(inp_positive)
    negative = embedder(inp_negative)
    output = Concatenate(axis=1)([anchor, positive, negative])
    model = Model(inputs=[inp_anchor, inp_positive, inp_negative], outputs=output)
    model.compile(optimizer='adagrad', loss=lossless_triplet_loss)
    model.summary()
    return model, embedder
data_generator = TripletDataGeneratorIndexes(batch_size=128, batches_per_epoch=10000)
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
# dummy targets: lossless_triplet_loss ignores y_true
zeros = np.zeros((data_generator.batch_size, 1, 1))
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, 'triplet loss indexes')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for a, p, n in data_generator:
        model.train_on_batch([a, p, n], zeros)
0 724.1
3 -143.5
6 11.7
9 36.2
12 -123.5
15 150.1
18 -51.9
21 5.0
24 -43.5
7.2 Triplet loss embeddings
class TripletDataGeneratorEmbedings(TripletDataGeneratorIndexes):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.target_len = kwargs['target_len']
        self.embedding_size = len(vectorize('any_token'))
        self.use_word2vec = True
        self.use_fasttext = True
        self.batch_size = kwargs.get('batch_size', self.batch_size)  # honor the batch_size argument
        self.batches_per_epoch = kwargs['batches_per_epoch']

    def vectorize(self, sentences):
        # reuse the token-embedding vectorization from LabelsDataGenerator
        return LabelsDataGenerator.vectorize(self, sentences)
def models_builder(data_generator):
    sentence_embeding_size = 300

    def lossless_triplet_loss(y_true, y_pred, N=sentence_embeding_size, beta=100, epsilon=1e-8):
        """
        Implementation of the triplet loss function
        Arguments:
        y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.
        y_pred -- python list containing three objects:
            anchor -- the encodings for the anchor data
            positive -- the encodings for the positive data (similar to anchor)
            negative -- the encodings for the negative data (different from anchor)
        N -- The number of dimension
        beta -- The scaling factor, N is recommended
        epsilon -- The Epsilon value to prevent ln(0)
        Returns:
        loss -- real number, value of the loss
        """
        anchor = tf.convert_to_tensor(y_pred[:, 0:N])
        positive = tf.convert_to_tensor(y_pred[:, N:N*2])
        negative = tf.convert_to_tensor(y_pred[:, N*2:N*3])
        # distance between the anchor and the positive
        pos_dist = tf.math.reduce_sum(tf.math.square(tf.math.subtract(anchor, positive)), 1)
        # distance between the anchor and the negative
        neg_dist = tf.math.reduce_sum(tf.math.square(tf.math.subtract(anchor, negative)), 1)
        # non-linear values
        pos_dist = -tf.math.log(-tf.math.divide((pos_dist), beta) + 1 + epsilon)
        neg_dist = -tf.math.log(-tf.math.divide((N - neg_dist), beta) + 1 + epsilon)
        # compute loss
        loss = neg_dist + pos_dist
        return loss

    def inception_convolutional_layer(X, complexity, rate=0.2, regularizer=0):
        X_7 = Conv1D(int(complexity/7), kernel_size=7, strides=1, padding='same')(X)
        X_6 = Conv1D(int(complexity/6), kernel_size=6, strides=1, padding='same')(X)
        X_5 = Conv1D(int(complexity/5), kernel_size=5, strides=1, padding='same')(X)
        X_4 = Conv1D(int(complexity/4), kernel_size=4, strides=1, padding='same')(X)
        X_3 = Conv1D(int(complexity/3), kernel_size=3, strides=1, padding='same')(X)
        X_2 = Conv1D(int(complexity/2), kernel_size=2, strides=1, padding='same')(X)
        X_1 = Conv1D(int(complexity/1), kernel_size=1, strides=1, padding='same')(X)
        X = Concatenate()([X_7, X_6, X_5, X_4, X_3, X_2, X_1])
        X = Activation('elu')(X)
        X = BatchNormalization()(X)
        X = Dropout(rate)(X)
        return X

    def bi_LSTM(X, complexity, rate=0.2, regularizer=0):
        X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(int(complexity/2), return_sequences=True))(X)
        X = tf.keras.layers.BatchNormalization()(X)
        X = tf.keras.layers.Dropout(rate)(X)
        return X

    def dense_layer(X, complexity, rate=0.2, regularizer=0):
        X = tf.keras.layers.Dense(int(complexity))(X)
        X = tf.keras.layers.Activation('elu')(X)
        X = tf.keras.layers.BatchNormalization()(X)
        X = tf.keras.layers.Dropout(rate)(X)
        return X

    def basic_sentence_vectorizer():
        rate = 0.20
        complexity = 300
        inp = Input(shape=(data_generator.target_len, data_generator.embedding_size))
        X = inp
        X = inception_convolutional_layer(X, complexity)
        X = inception_convolutional_layer(X, complexity)
        X = inception_convolutional_layer(X, complexity)
        X = tf.keras.layers.MaxPooling1D(pool_size=2)(X)
        X = inception_convolutional_layer(X, complexity)
        X = tf.keras.layers.MaxPooling1D(pool_size=2)(X)
        X = inception_convolutional_layer(X, complexity)
        X = tf.keras.layers.MaxPooling1D(pool_size=2)(X)
        R = inp
        R = bi_LSTM(R, complexity)
        R = bi_LSTM(R, complexity/2)
        attention_probs = tf.keras.layers.Dense(int(complexity/2), activation='sigmoid', name='attention_probs')(R)
        R = multiply([R, attention_probs], name='attention_mul')
        R = Dropout(rate)(R)
        R = MaxPooling1D(pool_size=2)(R)
        R = inception_convolutional_layer(R, complexity)
        R = MaxPooling1D(pool_size=2)(R)
        R = inception_convolutional_layer(R, complexity)
        R = MaxPooling1D(pool_size=2)(R)
        X = Concatenate(axis=-1)([X, R])
        X = Flatten()(X)
        X = BatchNormalization()(X)
        X = Dropout(rate)(X)
        X = dense_layer(X, complexity)
        X = dense_layer(X, complexity)
        X = dense_layer(X, complexity)
        X = Dense(sentence_embeding_size, activation='sigmoid')(X)
        vectorizer = Model(inputs=inp, outputs=X)
        return vectorizer

    inp_anchor = Input(shape=(data_generator.target_len, data_generator.embedding_size))
    inp_positive = Input(shape=(data_generator.target_len, data_generator.embedding_size))
    inp_negative = Input(shape=(data_generator.target_len, data_generator.embedding_size))
    embedder = basic_sentence_vectorizer()
    anchor = embedder(inp_anchor)
    positive = embedder(inp_positive)
    negative = embedder(inp_negative)
    output = Concatenate(axis=1)([anchor, positive, negative])
    model = Model(inputs=[inp_anchor, inp_positive, inp_negative], outputs=output)
    model.compile(optimizer='adagrad', loss=lossless_triplet_loss)
    model.summary()
    return model, embedder
data_generator = TripletDataGeneratorEmbedings(target_len=20, batch_size=32, batches_per_epoch=10000)
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
zeros = np.zeros((data_generator.batch_size, 1, 1))
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, 'triplet loss embeding')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for a, p, n in data_generator:
        model.train_on_batch([a, p, n], zeros)
0 283.9
3 334.2
6 218.1
9 219.6
12 262.8
15 282.4
18 289.7
21 274.9
plot_results()
Of the pretrained models, ELMo came out on top: elmo_ru-news scored 4631.3, well ahead of every BERT variant, and even the dedicated sentence_ru BERT lost to the conversational one.
Of the models trained here, the plain BOW classifier from the transfer-learning section was the strongest, peaking around 1187 despite being the simplest architecture.
The autoencoders disappointed. The untrained networks start around 1770-1900, and training only drives the benchmark score down; the bottleneck apparently learns little that is useful for similarity.
Triplet loss did not help either: the BOW version with its 100-dimensional embeddings collapsed right after the first evaluation, and the embedding version never rose above about 334.
Two methods from part one — BOW over stems with stop words removed, and the tf-idf-weighted mean of word vectors — did not give outstanding average results, yet produced very, very good results on some sentences. So with these methods, everything depends on the data.
If I collect enough ideas, part 3 will most likely follow in due course.