创建句子嵌入的方法概述,第2部分

您好,本文是《创建句子嵌入的方法概述》一文的续篇。本指南文字很少、代码很多,可以直接 Ctrl+C、Ctrl+V,然后加以改进并做进一步测试。



第一部分是必读的



4. BERT



from deeppavlov.core.common.file import read_json
from deeppavlov import build_model, configs
from deeppavlov.models.embedders.elmo_embedder import ELMoEmbedder
#     http://docs.deeppavlov.ai/en/master/features/pretrained_vectors.html


4.1 rubert_cased_L-12_H-768_A-12_pt



class RU_BERT_CLASS:
    """Thin wrapper around a DeepPavlov BERT embedder config.

    Exposes the two callables the evaluation helpers expect:
    ``vectorizer`` (sentences -> token lists) and ``predict``
    (token lists -> one embedding per sentence).
    """

    def __init__(self, name):
        """Build the embedder from the stock config, pointing BERT_PATH at `name`.

        name -- directory name of the downloaded checkpoint, resolved
                relative to the current working directory.
        """
        config = read_json(configs.embedder.bert_embedder)
        checkpoint_dir = os.path.join('./.', name)
        config['metadata']['variables']['BERT_PATH'] = checkpoint_dir
        self.m = build_model(config)

    def vectorizer(self, sentences):
        """Split every sentence on whitespace and return the list of token lists."""
        tokenized = []
        for sentence in sentences:
            tokenized.append(sentence.split())
        return tokenized

    def predict(self, tokens):
        """Run the model and return its sixth output.

        The model call must yield exactly seven values; the sixth is the one
        the original code names the mean-pooled sentence embeddings.
        """
        _, _, _, _, _, mean_embeddings, _ = self.m(tokens)
        return mean_embeddings

# Score the rubert checkpoint using cosine distance between sentence vectors.
bert = RU_BERT_CLASS('rubert_cased_L-12_H-768_A-12_pt')
get_similarity_values = similarity_values_wrapper(
    bert.predict,
    bert.vectorizer,
    distance_function=cosine_distances,
)
evaluate(get_similarity_values, 'rubert')


'rubert: 2895.7'



4.2 ru_conversational_cased_L-12_H-768_A-12_pt



# Score the conversational checkpoint using cosine distance.
bert = RU_BERT_CLASS('ru_conversational_cased_L-12_H-768_A-12_pt')
get_similarity_values = similarity_values_wrapper(
    bert.predict,
    bert.vectorizer,
    distance_function=cosine_distances,
)
evaluate(get_similarity_values, 'ru_conversational')


'ru_conversational:3559.1'



4.3 sentence_ru_cased_L-12_H-768_A-12_pt



# Score the sentence_ru checkpoint using cosine distance.
bert = RU_BERT_CLASS('sentence_ru_cased_L-12_H-768_A-12_pt')
get_similarity_values = similarity_values_wrapper(
    bert.predict,
    bert.vectorizer,
    distance_function=cosine_distances,
)
evaluate(get_similarity_values, 'sentence_ru')


'sentence_ru:2660.2'



4.4 elmo_ru-news_wmt11-16_1.5M_steps



class ELMO_CLASS(RU_BERT_CLASS):
    """ELMo counterpart of RU_BERT_CLASS; reuses its whitespace `vectorizer`."""

    def __init__(self, name):
        """Create an ELMoEmbedder from the archive `name` on the DeepPavlov file server.

        The parent constructor is intentionally not called: no BERT config is needed.
        """
        archive_url = f"http://files.deeppavlov.ai/deeppavlov_data/{name}"
        self.m = ELMoEmbedder(archive_url)

    def predict(self, tokens):
        """Return whatever the embedder produces for the tokenized sentences."""
        embeddings = self.m(tokens)
        return embeddings


# Score the ELMo model from the news/WMT archive using cosine distance.
elmo = ELMO_CLASS('elmo_ru-news_wmt11-16_1.5M_steps.tar.gz')
get_similarity_values = similarity_values_wrapper(
    elmo.predict,
    elmo.vectorizer,
    distance_function=cosine_distances,
)
evaluate(get_similarity_values, 'elmo_ru-news')


'elmo_ru-news: 4631.3'



4.5 elmo_ru-wiki_600k_steps



# Score the ELMo model from the wiki archive using cosine distance.
elmo = ELMO_CLASS('elmo_ru-wiki_600k_steps.tar.gz')
get_similarity_values = similarity_values_wrapper(
    elmo.predict,
    elmo.vectorizer,
    distance_function=cosine_distances,
)
evaluate(get_similarity_values, 'elmo_ru-wiki')


'elmo_ru-wiki: 4507.6'



4.6 elmo_ru-twitter_2013-01_2018-04_600k_steps



# Score the ELMo model from the twitter archive using cosine distance.
elmo = ELMO_CLASS('elmo_ru-twitter_2013-01_2018-04_600k_steps.tar.gz')
get_similarity_values = similarity_values_wrapper(
    elmo.predict,
    elmo.vectorizer,
    distance_function=cosine_distances,
)
evaluate(get_similarity_values, 'elmo_ru-twitter')


'elmo_ru-twitter: 2962.2'



# plot_results is defined outside this part; presumably it charts the scores
# recorded by the evaluate() calls so far (the cell output is an image).
plot_results()


png



5.



, , .



5.1 embedings -> embedings



def models_builder(data_generator):
    """Build the 'embedings -> embedings' autoencoder and its encoder half.

    Arguments:
    data_generator -- object exposing `max_len` and `embedding_size`, which
                      fix the (timesteps, features) shape of the input.

    Returns:
    (autoencoder, embedder) -- the compiled full model, and a model sharing
    its input that stops at the layer named 'embeding_output'.
    """
    hidden_units = 300
    seq_len = data_generator.max_len
    emb_dim = data_generator.embedding_size

    def cosine_loss(y_true, y_pred):
        # Mean over all positions of cosine_similarity along the feature axis.
        per_position = cosine_similarity(y_true, y_pred, axis=-1)
        return K.mean(per_position)

    inp = Input(shape=(seq_len, emb_dim))

    # Encoder: two stacked BiLSTMs, flattened, then dense layers down to the bottleneck.
    encoded = Bidirectional(LSTM(hidden_units, return_sequences=True))(inp)
    encoded = Bidirectional(LSTM(int(hidden_units/10), return_sequences=True))(encoded)
    encoded = Flatten()(encoded)
    for _ in range(2):
        encoded = Dense(hidden_units, activation='elu')(encoded)
    bottleneck = Dense(hidden_units, activation='linear', name='embeding_output')(encoded)

    # Decoder: expand back to a sequence and reconstruct the input embeddings.
    decoded = Dense(hidden_units, activation='elu')(bottleneck)
    decoded = Dense(seq_len*hidden_units, activation='elu')(decoded)
    decoded = Reshape((seq_len, hidden_units))(decoded)
    for _ in range(2):
        decoded = Bidirectional(LSTM(hidden_units, return_sequences=True))(decoded)
    decoded = Dense(emb_dim, activation='elu')(decoded)

    autoencoder = Model(inputs=inp, outputs=decoded)
    autoencoder.compile(loss=cosine_loss, optimizer='adam')
    autoencoder.summary()

    bottleneck_output = autoencoder.get_layer('embeding_output').output
    embedder = Model(inputs=inp, outputs=bottleneck_output)
    return autoencoder, embedder

# Build the data source, the autoencoder and the similarity scorer for section 5.1.
data_generator = EmbedingsDataGenerator(use_fasttext=False)
autoencoder, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(
    embedder.predict,
    data_generator.vectorize,
    distance_function=cosine_distances,
)

# Train for up to 1000 epochs, scoring every third epoch; stop once the score
# drops relative to the previous check (only after epoch 20).
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = parse_result(evaluate(get_similarity_values, ' embedings -> embedings'))
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x, y in data_generator:
        # Autoencoder: the input is also the target.
        autoencoder.train_on_batch(x, x)


0 1770.2

3 212.6

6 138.8

9 84.8

12 78.1

15 106.4

18 112.7

21 79.7



5.2 embedings -> indexes



def models_builder(data_generator):
    """Build the 'embedings -> indexes' autoencoder and its encoder half.

    Arguments:
    data_generator -- object exposing `max_len`, `embedding_size` and
                      `token2index` (its length sets the softmax width).

    Returns:
    (autoencoder, embedder) -- the compiled full model, and a model sharing
    its input that stops at the layer named 'embeding_output'.
    """
    hidden_units = 300
    seq_len = data_generator.max_len

    inp = Input(shape=(seq_len, data_generator.embedding_size))

    # Encoder: two stacked BiLSTMs, flattened, then dense layers down to the bottleneck.
    encoded = Bidirectional(LSTM(hidden_units, return_sequences=True))(inp)
    encoded = Bidirectional(LSTM(int(hidden_units/10), return_sequences=True))(encoded)
    encoded = Flatten()(encoded)
    for _ in range(2):
        encoded = Dense(hidden_units, activation='elu')(encoded)
    bottleneck = Dense(hidden_units, activation='linear', name='embeding_output')(encoded)

    # Decoder: expand back to a sequence and predict a token distribution per position.
    decoded = Dense(hidden_units, activation='elu')(bottleneck)
    decoded = Dense(seq_len*hidden_units, activation='elu')(decoded)
    decoded = Reshape((seq_len, hidden_units))(decoded)
    for _ in range(2):
        decoded = Bidirectional(LSTM(hidden_units, return_sequences=True))(decoded)
    token_probs = Dense(len(data_generator.token2index), activation='softmax')(decoded)

    autoencoder = Model(inputs=inp, outputs=token_probs)
    autoencoder.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    autoencoder.summary()

    bottleneck_output = autoencoder.get_layer('embeding_output').output
    embedder = Model(inputs=inp, outputs=bottleneck_output)
    return autoencoder, embedder

# Build the data source, the autoencoder and the similarity scorer for section 5.2.
data_generator = IndexesDataGenerator()
autoencoder, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(
    embedder.predict,
    data_generator.vectorize,
)

# Train for up to 1000 epochs, scoring every third epoch; stop once the score
# drops relative to the previous check (only after epoch 20).
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = parse_result(evaluate(get_similarity_values, ' embedings -> indexes'))
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x_e, x_i, y_i in data_generator:
        # The first yielded array is the model input, the second the training target.
        autoencoder.train_on_batch(x_e, x_i)


0 1352.9

3 43.6

6 41.7

9 8.1

12 -5.6

15 43.1

18 36.1

21 -3.7



5.3 LSTM -> LSTM



def models_builder(data_generator):
    """Build the 'LSTM -> LSTM' sequence autoencoder and its encoder half.

    The encoder LSTM's final states are merged into one sentence vector
    ('embeding_output'); two dense projections of that vector seed a decoder
    LSTM that is fed a second input (`inp_zeros`).

    Arguments:
    data_generator -- object exposing `max_len` and `embedding_size`.

    Returns:
    (autoencoder, embedder) -- the compiled two-input model, and a
    single-input model that stops at the layer named 'embeding_output'.
    """
    units = 300
    sequence_shape = (data_generator.max_len, data_generator.embedding_size)

    def cosine_loss(y_true, y_pred):
        # Mean over all positions of cosine_similarity along the feature axis.
        per_position = cosine_similarity(y_true, y_pred, axis=-1)
        return K.mean(per_position)

    # Encoder: only the final hidden and cell states are kept.
    inp = Input(shape=sequence_shape)
    _, encoder_h, encoder_c = LSTM(units, return_state=True)(inp)
    merged_states = Concatenate()([encoder_h, encoder_c])
    sentence_vector = Dense(units, activation='linear', name='embeding_output')(merged_states)

    # Decoder: initial states are learned projections of the sentence vector.
    state_c = Dense(units, activation='linear')(sentence_vector)
    state_h = Dense(units, activation='linear')(sentence_vector)
    inp_zeros = Input(shape=sequence_shape)

    # The projections are passed in the order [state_c, state_h], as in the original.
    decoded = LSTM(units, return_sequences=True)(inp_zeros, [state_c, state_h])
    decoded = Dense(data_generator.embedding_size, activation='linear')(decoded)

    autoencoder = Model(inputs=[inp, inp_zeros], outputs=decoded)
    autoencoder.compile(loss=cosine_loss, optimizer='adam')
    autoencoder.summary()

    sentence_output = autoencoder.get_layer('embeding_output').output
    embedder = Model(inputs=inp, outputs=sentence_output)
    return autoencoder, embedder

# Build the data source, the autoencoder and the similarity scorer for section 5.3.
data_generator = EmbedingsDataGenerator(use_fasttext=False)
autoencoder, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)


# Constant all-zero second input for the decoder, one row per batch element.
zeros = np.zeros((data_generator.batch_size, data_generator.max_len, data_generator.embedding_size))

# Train for up to 1000 epochs, scoring every third epoch; stop once the score
# drops relative to the previous check (only after epoch 20).
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        # Label fixed: it used to be ' embedings -> indexes', copied from the
        # previous experiment, so this run was reported under the wrong name.
        new_result = evaluate(get_similarity_values, ' LSTM -> LSTM')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x, y in data_generator:
        # Autoencoder: the embedding sequence is both the encoder input and the target.
        autoencoder.train_on_batch([x, zeros], x)


0 1903.6

3 1299.3

6 313.5

9 445.3

12 454.9

15 447.7

18 454.5

21 448.1



5.4 LSTM -> LSTM -> indexes



def models_builder(data_generator):
    """Build the 'LSTM -> LSTM -> indexes' sequence autoencoder and its encoder half.

    Same encoder/decoder layout as the embedding-reconstruction variant, but
    the decoder ends in a softmax over `token2index`.

    Arguments:
    data_generator -- object exposing `max_len`, `embedding_size` and `token2index`.

    Returns:
    (autoencoder, embedder) -- the compiled two-input model, and a
    single-input model that stops at the layer named 'embeding_output'.
    """
    units = 300
    sequence_shape = (data_generator.max_len, data_generator.embedding_size)

    # Encoder: only the final hidden and cell states are kept.
    inp = Input(shape=sequence_shape)
    _, encoder_h, encoder_c = LSTM(units, return_state=True)(inp)
    merged_states = Concatenate()([encoder_h, encoder_c])
    sentence_vector = Dense(units, activation='linear', name='embeding_output')(merged_states)

    # Decoder: initial states are learned projections of the sentence vector.
    state_c = Dense(units, activation='linear')(sentence_vector)
    state_h = Dense(units, activation='linear')(sentence_vector)
    inp_zeros = Input(shape=sequence_shape)

    # The projections are passed in the order [state_c, state_h], as in the original.
    decoded = LSTM(units, return_sequences=True)(inp_zeros, [state_c, state_h])
    token_probs = Dense(len(data_generator.token2index), activation='softmax')(decoded)

    autoencoder = Model(inputs=[inp, inp_zeros], outputs=token_probs)
    autoencoder.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    autoencoder.summary()

    sentence_output = autoencoder.get_layer('embeding_output').output
    embedder = Model(inputs=inp, outputs=sentence_output)
    return autoencoder, embedder

# Build the data source, the autoencoder and the similarity scorer for section 5.4.
data_generator = IndexesDataGenerator()
autoencoder, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(
    embedder.predict,
    data_generator.vectorize,
)

# Constant all-zero second input for the decoder, one row per batch element.
zeros = np.zeros(
    (data_generator.batch_size, data_generator.max_len, data_generator.embedding_size)
)

# Train for up to 1000 epochs, scoring every third epoch; stop once the score
# drops relative to the previous check (only after epoch 20).
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = parse_result(evaluate(get_similarity_values, '  LSTM -> LSTM -> indexes'))
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x_e, x_i, y_i in data_generator:
        # The first yielded array is the encoder input, the second the training target.
        autoencoder.train_on_batch([x_e, zeros], x_i)


0 1903.6

3 1483.3

6 1249.3

9 566.3

12 789.2

15 702.3

18 480.5

21 552.3

24 533.0





6. Transfer Learning



# Flatten texts_for_training into (sentence, topic) pairs.
# texts_for_training is defined outside this part; it is iterated for topics
# and indexed by topic to get that topic's sentences (presumably a dict of
# lists — confirm against part 1).
TEXTS_CORPUS_WITH_LABEL = [(sentence, topic) for topic in texts_for_training for sentence in texts_for_training[topic]]

class BowDataGenerator(EmbedingsDataGenerator):
    """Random-batch generator yielding bag-of-words vectors with one-hot topic labels.

    The parent constructor is not called; every attribute used here is set
    in this __init__.
    """

    def __init__(self, texts_topics=TEXTS_CORPUS_WITH_LABEL, batch_size=128, batches_per_epoch=100):
        """
        Arguments:
        texts_topics -- sequence of (text, topic) pairs
        batch_size -- number of samples per yielded batch
        batches_per_epoch -- number of batches produced by one iteration pass
        """
        self.texts_topics = texts_topics

        # Assign each distinct topic an integer id.
        distinct_topics = {topic for _, topic in self.texts_topics}
        self.topic2index = {topic: index for index, topic in enumerate(distinct_topics)}

        self.batch_size = batch_size
        self.batches_per_epoch = batches_per_epoch

        corpus = [pair[0] for pair in self.texts_topics]
        self.count_vectorizer = CountVectorizer().fit(corpus)

        # Weight each class by the inverse of its number of examples.
        counts = Counter([pair[1] for pair in self.texts_topics])
        self.class_weight = {
            self.topic2index[topic]: 1/occurrences
            for topic, occurrences in counts.items()
        }

    def vectorize(self, sentences):
        """Return the dense count matrix of `sentences` under the fitted vocabulary."""
        sparse_counts = self.count_vectorizer.transform(sentences)
        return sparse_counts.toarray()

    def __iter__(self):
        """Yield `batches_per_epoch` batches of (count vectors, one-hot labels).

        Samples are drawn with replacement via random.choice.
        """
        for _ in tqdm(range(self.batches_per_epoch), leave=False):
            texts = []
            labels = []
            # Always draw at least one sample, then keep drawing until the batch is full.
            while True:
                text, topic = random.choice(self.texts_topics)
                texts.append(text)
                labels.append(self.topic2index[topic])
                if len(texts) >= self.batch_size:
                    break

            features = self.count_vectorizer.transform(texts).toarray()
            targets = to_categorical(labels, num_classes=len(self.topic2index))
            yield np.array(features), np.array(targets)

# Instantiate with the defaults (fits the CountVectorizer on the labelled corpus).
data_generator = BowDataGenerator()


6.1 BOW



def models_builder(data_generator):
    """Build the bag-of-words topic classifier and its embedding sub-model.

    Arguments:
    data_generator -- object exposing a fitted `count_vectorizer` (its
                      vocabulary size sets the input width) and `topic2index`
                      (its length sets the softmax width).

    Returns:
    (model, embedder) -- the compiled classifier, and a model sharing its
    input that stops at the Dense layer named 'embeding_output' (i.e. before
    that layer's 'elu' activation).
    """
    complexity = 500
    # len(vocabulary_) equals the number of features. It replaces
    # get_feature_names(), which was removed in scikit-learn 1.2, and it
    # avoids building the full list of names just to count them.
    vocabulary_size = len(data_generator.count_vectorizer.vocabulary_)

    inp = Input(shape=(vocabulary_size,))
    X = inp
    X = Dense(complexity)(X)
    X = Activation('elu')(X)
    X = Dense(complexity)(X)
    X = Activation('elu')(X)
    X = Dense(complexity, name='embeding_output')(X)
    X = Activation('elu')(X)
    X = Dense(len(data_generator.topic2index), activation='softmax')(X)

    model = Model(inputs=inp, outputs=X)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()

    embedder = Model(inputs=inp, outputs=model.get_layer('embeding_output').output)
    return model, embedder

# Build the data source, the classifier and the similarity scorer for section 6.1.
data_generator = BowDataGenerator()
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(
    embedder.predict,
    data_generator.vectorize,
)

# Train for up to 1000 epochs, scoring every third epoch; stop once the score
# drops relative to the previous check (only after epoch 20).
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = parse_result(evaluate(get_similarity_values, '  BOW'))
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x, y in data_generator:
        # Per-class weights come from the generator (inverse class frequency).
        model.train_on_batch(x, y, class_weight=data_generator.class_weight)


0 601.4

3 1175.4

6 1187.0

9 1175.9

12 1097.9

15 1083.4

18 1083.8

21 1060.5



6.2 LSTM + MaxPooling (InferSent)



:

Arxiv

-



class LabelsDataGenerator(EmbedingsDataGenerator):
    """Random-batch generator yielding padded token-embedding sequences with one-hot topic labels.

    The parent constructor is not called; every attribute used here is set
    in this __init__. Token embeddings come from the module-level
    `vectorize` function.
    """

    def __init__(self, texts_topics=TEXTS_CORPUS_WITH_LABEL, target_len=20, batch_size=128, batches_per_epoch=100, use_word2vec=True, use_fasttext=True):
        """
        Arguments:
        texts_topics -- sequence of (text, topic) pairs
        target_len -- number of tokens every sample is padded/truncated to
        batch_size -- number of samples per yielded batch
        batches_per_epoch -- number of batches produced by one iteration pass
        use_word2vec, use_fasttext -- forwarded to the module-level `vectorize`
        """
        self.texts_topics = texts_topics
        self.topic2index = {topic: index for index, topic in enumerate({topic for text, topic in self.texts_topics})}
        self.target_len = target_len
        self.batch_size = batch_size
        self.batches_per_epoch = batches_per_epoch
        self.use_word2vec = use_word2vec
        self.use_fasttext = use_fasttext
        # Probe one token to learn the embedding width.
        self.embedding_size = len(self._embed_token('token'))
        # Weight each class by the inverse of its number of examples.
        counts = Counter([text_topic[1] for text_topic in self.texts_topics])
        self.class_weight = {self.topic2index[intent_id]: 1/counts[intent_id] for intent_id in counts}

    def _embed_token(self, token):
        """Embed a single token with the module-level `vectorize` and this generator's flags."""
        return vectorize(token, use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext)

    def vectorize(self, sentences):
        """Return the padded embedding sequences for `sentences`.

        Each sentence is stringified, split on whitespace and embedded token
        by token, then padded/truncated to `target_len`.
        """
        vectorized = [
            [self._embed_token(token) for token in str(text).split()]
            for text in sentences
        ]
        # pad_sequences defaults to dtype='int32', which would truncate the
        # float embeddings to integers; request float32 explicitly.
        return pad_sequences(vectorized, maxlen=self.target_len, dtype='float32')

    def __iter__(self):
        """Yield `batches_per_epoch` batches of (padded embeddings, one-hot labels).

        Texts are drawn with replacement. A text longer than `target_len`
        tokens contributes several consecutive windows, each labelled with
        the text's topic.
        """
        for _ in tqdm(range(self.batches_per_epoch), leave=False):
            X_batch = []
            y_batch = []
            while True:
                text, topic = random.choice(self.texts_topics)
                label = self.topic2index[topic]
                x_vec = []
                for token in text.split():
                    if len(x_vec) >= self.target_len:
                        # Window is full: emit it and start a fresh list.
                        # Previously the same list was kept and re-appended for
                        # every further token, producing aliased duplicates.
                        X_batch.append(x_vec)
                        y_batch.append(label)
                        x_vec = []
                        if len(X_batch) >= self.batch_size:
                            break
                    x_vec.append(self._embed_token(token))
                else:
                    # Text exhausted without filling the batch: emit the last window.
                    X_batch.append(x_vec)
                    y_batch.append(label)

                if len(X_batch) >= self.batch_size:
                    break

            # float32 for the same reason as in vectorize().
            X_batch = pad_sequences(X_batch, maxlen=self.target_len, dtype='float32')
            y_batch = to_categorical(y_batch, num_classes=len(self.topic2index))
            yield X_batch, y_batch


def models_builder(data_generator):
    """Build the 'LSTM + MaxPooling' topic classifier and its embedding sub-model.

    Arguments:
    data_generator -- object exposing `target_len`, `embedding_size` and `topic2index`.

    Returns:
    (model, embedder) -- the compiled classifier, and a model sharing its
    input that stops at the Dense layer named 'embeding_output' (i.e. before
    that layer's 'sigmoid' activation).
    """
    units = 768
    inp = Input(shape=(data_generator.target_len, data_generator.embedding_size))

    sequence = Bidirectional(LSTM(units, return_sequences=True))(inp)
    # Swap the time and feature axes before pooling.
    # NOTE(review): the pooling window is a hard-coded 600, which does not
    # match the BiLSTM width here (2 * 768 with the default concatenation);
    # it looks tuned for 300 units. Confirm the intended pooling axis and size.
    swapped = Permute((2, 1))(sequence)
    pooled = MaxPooling1D(pool_size=600)(swapped)
    features = Flatten()(pooled)

    hidden = Dense(units)(features)
    hidden = Activation('elu')(hidden)
    sentence_vector = Dense(units, name='embeding_output')(hidden)
    activated = Activation('sigmoid')(sentence_vector)
    topic_probs = Dense(len(data_generator.topic2index), activation='softmax')(activated)

    model = Model(inputs=inp, outputs=topic_probs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()

    sentence_output = model.get_layer('embeding_output').output
    embedder = Model(inputs=inp, outputs=sentence_output)
    return model, embedder

# Build the data source, the classifier and the similarity scorer for section 6.2.
data_generator = LabelsDataGenerator()
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(
    embedder.predict,
    data_generator.vectorize,
)

# Train for up to 1000 epochs, scoring every third epoch; stop once the score
# drops relative to the previous check (only after epoch 20).
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = parse_result(evaluate(get_similarity_values, '  LSTM + MaxPooling'))
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x, y in data_generator:
        # Per-class weights come from the generator (inverse class frequency).
        model.train_on_batch(x, y, class_weight=data_generator.class_weight)


0 87.0

3 152.1

6 110.5

9 146.7

12 166.2

15 79.8

18 47.2

21 84.0

24 144.8

27 83.8



6.3 LSTM + Conv1D + AveragePooling



def models_builder(data_generator):
    """Build the 'LSTM + Conv1D + AveragePooling' topic classifier and its embedding sub-model.

    A recurrent branch and a convolutional branch read the same input, are
    concatenated, and pass through three pooling stages before the dense head.

    Arguments:
    data_generator -- object exposing `target_len`, `embedding_size` and `topic2index`.

    Returns:
    (model, embedder) -- the compiled classifier, and a model sharing its
    input that stops at the Dense layer named 'embeding_output' (i.e. before
    that layer's 'elu' activation).
    """
    width = 600
    inp = Input(shape=(data_generator.target_len, data_generator.embedding_size))

    # Recurrent branch: two stacked BiLSTMs.
    recurrent = inp
    for _ in range(2):
        recurrent = Bidirectional(LSTM(width, return_sequences=True))(recurrent)

    # Convolutional branch: two stacked same-padded convolutions.
    convolved = inp
    for _ in range(2):
        convolved = Conv1D(width, 3, strides=1, padding='same')(convolved)

    merged = Concatenate()([recurrent, convolved])
    merged = AveragePooling1D(pool_size=2)(merged)

    # Two further conv + pool stages.
    for _ in range(2):
        merged = Conv1D(width, 3, strides=1, padding='same')(merged)
        merged = AveragePooling1D(pool_size=2)(merged)

    flat = Flatten()(merged)
    hidden = Dense(width)(flat)
    hidden = Activation('sigmoid')(hidden)
    sentence_vector = Dense(width, name = 'embeding_output')(hidden)
    activated = Activation('elu')(sentence_vector)
    topic_probs = Dense(len(data_generator.topic2index), activation='softmax')(activated)

    model = Model(inputs=inp, outputs=topic_probs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()

    sentence_output = model.get_layer('embeding_output').output
    embedder = Model(inputs=inp, outputs=sentence_output)
    return model, embedder

# Build the data source, the classifier and the similarity scorer for section 6.3.
data_generator = LabelsDataGenerator()
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(
    embedder.predict,
    data_generator.vectorize,
)


0 353.8

3 -147.8

6 7.6

9 5.5

12 -133.6

15 -133.6

18 9.0

21 9.0

24 -133.6



6.4 LSTM + Inception + Attention



def models_builder(data_generator):
    """Build the 'LSTM + Inception + Attention' topic classifier and its embedding sub-model.

    A purely convolutional branch and a recurrent branch (with an
    element-wise sigmoid gate) read the same input and are concatenated
    before the dense head.

    Arguments:
    data_generator -- object exposing `target_len`, `embedding_size` and `topic2index`.

    Returns:
    (model, embedder) -- the compiled classifier, and a model sharing its
    input that stops at the Dense layer named 'embeding_output' (i.e. before
    that block's activation, batch norm and dropout).
    """
    rate = 0.20
    complexity = 500

    def inception_convolutional_layer(X, complexity, rate=0.2, regularizer=0):
        # Parallel same-padded convolutions with kernel sizes 7 down to 1; the
        # branch with kernel size k gets int(complexity/k) filters.
        branches = [
            Conv1D(int(complexity/kernel), kernel_size=kernel, strides=1, padding='same')(X)
            for kernel in range(7, 0, -1)
        ]
        merged = Concatenate()(branches)
        merged = Activation('elu')(merged)
        merged = BatchNormalization()(merged)
        return Dropout(rate)(merged)

    def bi_LSTM(X, complexity, rate=0.2, regularizer=0):
        # Bidirectional LSTM with int(complexity/2) units per direction.
        out = Bidirectional(LSTM(int(complexity/2), return_sequences=True))(X)
        out = BatchNormalization()(out)
        return Dropout(rate)(out)

    def dense_layer(X, complexity, activation='elu', rate=0.2, regularizer=0, name=None):
        # Dense -> activation -> batch norm -> dropout; `name` tags the Dense layer.
        out = Dense(int(complexity), name=name)(X)
        out = Activation(activation)(out)
        out = BatchNormalization()(out)
        return Dropout(rate)(out)

    inp = Input(shape=(data_generator.target_len, data_generator.embedding_size))

    # Convolutional branch: three inception blocks, a pool, then two (inception, pool) stages.
    conv_branch = inp
    for _ in range(3):
        conv_branch = inception_convolutional_layer(conv_branch, complexity)
    conv_branch = MaxPooling1D(pool_size=2)(conv_branch)
    for _ in range(2):
        conv_branch = inception_convolutional_layer(conv_branch, complexity)
        conv_branch = MaxPooling1D(pool_size=2)(conv_branch)

    # Recurrent branch: two BiLSTMs, gated by a sigmoid Dense of matching width.
    rec_branch = bi_LSTM(inp, complexity)
    rec_branch = bi_LSTM(rec_branch, complexity/2)
    attention_probs = Dense(int(complexity/2), activation='sigmoid', name='attention_probs')(rec_branch)
    rec_branch = multiply([rec_branch, attention_probs], name='attention_mul')
    rec_branch = Dropout(rate)(rec_branch)
    rec_branch = MaxPooling1D(pool_size=2)(rec_branch)
    for _ in range(2):
        rec_branch = inception_convolutional_layer(rec_branch, complexity)
        rec_branch = MaxPooling1D(pool_size=2)(rec_branch)

    # Merge both branches along the feature axis and flatten.
    merged = Concatenate(axis=-1)([conv_branch, rec_branch])
    merged = Flatten()(merged)
    merged = BatchNormalization()(merged)
    merged = Dropout(rate)(merged)

    head = dense_layer(merged, complexity)
    head = dense_layer(head, complexity, activation='sigmoid')
    head = dense_layer(head, complexity, name='embeding_output')

    topic_probs = Dense(len(data_generator.topic2index), activation='softmax')(head)

    model = Model(inputs=inp, outputs=topic_probs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()

    sentence_output = model.get_layer('embeding_output').output
    embedder = Model(inputs=inp, outputs=sentence_output)
    return model, embedder

# Build the data source, the classifier and the similarity scorer for section 6.4.
data_generator = LabelsDataGenerator()
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(
    embedder.predict,
    data_generator.vectorize,
)

# Train for up to 1000 epochs, scoring every third epoch; stop once the score
# drops relative to the previous check (only after epoch 20).
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = parse_result(evaluate(get_similarity_values, '  LSTM + Inception + Attention'))
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x, y in data_generator:
        # Per-class weights come from the generator (inverse class frequency).
        model.train_on_batch(x, y, class_weight=data_generator.class_weight)


0 275.0

3 126.8

6 173.9

9 155.5

12 168.4

15 287.2

18 382.8

21 303.4



# plot_results is defined outside this part; presumably it charts the scores
# recorded by the evaluate() calls so far (the cell output is an image).
plot_results()


png



7 Triplet loss



, , , . , , , .

Triplet loss



7.1 Triplet loss BOW



class TripletDataGeneratorIndexes(BowDataGenerator):
    """Generator of (anchor, positive, negative) bag-of-words batches for triplet training.

    Anchor and positive are two different texts of the same topic; the
    negative is a text from a different topic.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to BowDataGenerator, then index texts by topic."""
        super().__init__(*args, **kwargs)

        texts_by_topic = {}
        for text, topic in self.texts_topics:
            texts_by_topic.setdefault(topic, []).append(text)

        # Keep only topics that have more than 5 texts.
        self.database = {
            topic: texts
            for topic, texts in texts_by_topic.items()
            if len(texts) > 5
        }
        self.all_topics = list(self.database)

    def __iter__(self):
        """Yield `batches_per_epoch` triples of vectorized anchor/positive/negative batches."""
        for _ in tqdm(range(self.batches_per_epoch), leave=False):
            anchor, positive, negative = [], [], []

            for _ in range(self.batch_size):
                anchor_topic = random.choice(self.all_topics)
                same_topic_texts = self.database[anchor_topic]

                # Two distinct positions within the anchor topic (resample on collision).
                anchor_index = np.random.randint(len(same_topic_texts))
                positive_index = np.random.randint(len(same_topic_texts))
                while positive_index == anchor_index:
                    positive_index = np.random.randint(len(same_topic_texts))

                # A different topic for the negative (resample on collision).
                negative_topic = random.choice(self.all_topics)
                while negative_topic == anchor_topic:
                    negative_topic = random.choice(self.all_topics)

                other_topic_texts = self.database[negative_topic]
                negative_index = np.random.randint(len(other_topic_texts))

                anchor.append(same_topic_texts[anchor_index])
                positive.append(same_topic_texts[positive_index])
                negative.append(other_topic_texts[negative_index])

            yield self.vectorize(anchor), self.vectorize(positive), self.vectorize(negative)


def models_builder(data_generator):
    """Build the bag-of-words triplet-loss model and its shared sentence embedder.

    One dense embedder is applied to the anchor, positive and negative
    inputs; the three embeddings are concatenated along axis 1 so that the
    loss can slice them apart again.

    Arguments:
    data_generator -- object exposing a fitted `count_vectorizer`; its
                      vocabulary size sets the input width.

    Returns:
    (model, embedder) -- the compiled three-input triplet model, and the
    shared single-input embedder (output width `sentence_embeding_size`).
    """
    sentence_embeding_size = 100
    complexity = 300
    # len(vocabulary_) equals the number of features. It replaces
    # get_feature_names(), which was removed in scikit-learn 1.2.
    vocabulary_size = len(data_generator.count_vectorizer.vocabulary_)

    def lossless_triplet_loss(y_true, y_pred, N=sentence_embeding_size, beta=sentence_embeding_size, epsilon=1e-8):
        """
        Implementation of the triplet loss function

        Arguments:
        y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.
        y_pred -- tensor of width 3*N holding, side by side:
                anchor -- the encodings for the anchor data
                positive -- the encodings for the positive data (similar to anchor)
                negative -- the encodings for the negative data (different from anchor)
        N  --  The number of dimension of ONE embedding; must equal the embedder's output width
        beta -- The scaling factor, N is recommended
        epsilon -- The Epsilon value to prevent ln(0)

        Returns:
        loss -- per-sample loss values
        """
        anchor = tf.convert_to_tensor(y_pred[:,0:N])
        positive = tf.convert_to_tensor(y_pred[:,N:N*2])
        negative = tf.convert_to_tensor(y_pred[:,N*2:N*3])

        # distance between the anchor and the positive
        pos_dist = tf.reduce_sum(tf.square(tf.subtract(anchor,positive)),1)
        # distance between the anchor and the negative
        neg_dist = tf.reduce_sum(tf.square(tf.subtract(anchor,negative)),1)

        # Non-linear values; with embeddings in [0, 1] both distances lie in
        # [0, N], which keeps the log arguments positive when beta >= N.
        pos_dist = -tf.math.log(-tf.math.divide((pos_dist),beta)+1+epsilon)
        neg_dist = -tf.math.log(-tf.math.divide((N-neg_dist),beta)+1+epsilon)

        # compute loss
        loss = neg_dist + pos_dist
        return loss

    def basic_sentence_vectorizer():
        # Dense stack mapping a count vector to a sentence embedding.
        inp = Input(shape=(vocabulary_size,))
        X = inp
        X = Dense(complexity)(X)
        X = Activation('elu')(X)
        X = Dense(complexity)(X)
        X = Activation('elu')(X)
        X = Dense(complexity, name='embeding_output')(X)
        X = Activation('elu')(X)
        # The output width must be sentence_embeding_size: the loss slices the
        # concatenated output in chunks of that size. The previous
        # Dense(complexity) produced 300 values per input, so all three slices
        # came from the anchor embedding alone. Sigmoid bounds each component
        # to [0, 1], as the loss assumes.
        X = Dense(sentence_embeding_size, activation='sigmoid')(X)
        vectorizer = Model(inputs=inp, outputs=X)
        return vectorizer

    inp_anchor = Input(shape=(vocabulary_size,))
    inp_positive = Input(shape=(vocabulary_size,))
    inp_negative = Input(shape=(vocabulary_size,))

    # One embedder instance, so all three inputs share the same weights.
    embedder = basic_sentence_vectorizer()

    anchor = embedder(inp_anchor)
    positive = embedder(inp_positive)
    negative = embedder(inp_negative)

    output = Concatenate(axis=1)([anchor, positive, negative])

    model = Model(inputs=[inp_anchor, inp_positive, inp_negative], outputs=output)
    model.compile(optimizer='adagrad', loss=lossless_triplet_loss)
    model.summary()
    return model, embedder

# Build the triplet data source, the model and the similarity scorer for section 7.1.
data_generator = TripletDataGeneratorIndexes(batch_size=128, batches_per_epoch=10000)
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(
    embedder.predict,
    data_generator.vectorize,
)

# Dummy target: train_on_batch needs one; it is sized from the generator's batch_size.
zeros = np.zeros((data_generator.batch_size, 1, 1))

# Train for up to 1000 epochs, scoring every third epoch; stop once the score
# drops relative to the previous check (only after epoch 20).
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = parse_result(evaluate(get_similarity_values, 'triplet loss indexes'))
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for a, p, n in data_generator:
        model.train_on_batch([a, p, n], zeros)


0 724.1

3 -143.5

6 11.7

9 36.2

12 -123.5

15 150.1

18 -51.9

21 5.0

24 -43.5



7.2 Triplet loss embedings



class TripletDataGeneratorEmbedings(TripletDataGeneratorIndexes):
    """Triplet generator whose samples are padded token-embedding sequences.

    Triplet sampling is inherited; only `vectorize` is replaced, by
    delegating to LabelsDataGenerator.vectorize.
    """

    def __init__(self, *args, **kwargs):
        """
        Keyword arguments:
        target_len -- required; number of tokens every sample is padded/truncated to.
        Every other argument (e.g. batch_size, batches_per_epoch) is forwarded
        to the parent constructor.

        Raises:
        KeyError -- if `target_len` is not given.
        """
        target_len = kwargs.pop('target_len')
        # Forward the remaining arguments. Previously super().__init__() was
        # called bare, so a caller-supplied batch_size was silently ignored.
        super().__init__(*args, **kwargs)
        self.target_len = target_len
        # Probe one token to learn the embedding width.
        self.embedding_size = len(vectorize('any_token'))
        # Attributes read by LabelsDataGenerator.vectorize.
        self.use_word2vec = True
        self.use_fasttext = True

    def vectorize(self, sentences):
        """Return padded embedding sequences, reusing LabelsDataGenerator's implementation."""
        return LabelsDataGenerator.vectorize(self, sentences)


def models_builder(data_generator):
    """Build the embedding-sequence triplet-loss model and its shared sentence embedder.

    One inception/BiLSTM embedder is applied to the anchor, positive and
    negative inputs; the three embeddings are concatenated along axis 1 so
    that the loss can slice them apart again.

    Arguments:
    data_generator -- object exposing `target_len` and `embedding_size`.

    Returns:
    (model, embedder) -- the compiled three-input triplet model, and the
    shared single-input embedder (sigmoid output of width
    `sentence_embeding_size`).
    """
    sentence_embeding_size = 300

    # beta now defaults to N. With the previous beta=100 and N=300 sigmoid
    # outputs, pos_dist/beta could exceed 1 and make the log argument
    # negative (NaN loss).
    def lossless_triplet_loss(y_true, y_pred, N=sentence_embeding_size, beta=sentence_embeding_size, epsilon=1e-8):
        """
        Implementation of the triplet loss function

        Arguments:
        y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.
        y_pred -- tensor of width 3*N holding, side by side:
                anchor -- the encodings for the anchor data
                positive -- the encodings for the positive data (similar to anchor)
                negative -- the encodings for the negative data (different from anchor)
        N  --  The number of dimension of ONE embedding
        beta -- The scaling factor, N is recommended
        epsilon -- The Epsilon value to prevent ln(0)

        Returns:
        loss -- per-sample loss values
        """
        anchor = tf.convert_to_tensor(y_pred[:,0:N])
        positive = tf.convert_to_tensor(y_pred[:,N:N*2])
        negative = tf.convert_to_tensor(y_pred[:,N*2:N*3])

        # distance between the anchor and the positive
        pos_dist = tf.math.reduce_sum(tf.math.square(tf.math.subtract(anchor,positive)),1)
        # distance between the anchor and the negative
        neg_dist = tf.math.reduce_sum(tf.math.square(tf.math.subtract(anchor,negative)),1)

        # Non-linear values; with embeddings in [0, 1] both distances lie in
        # [0, N], which keeps the log arguments positive when beta >= N.
        pos_dist = -tf.math.log(-tf.math.divide((pos_dist),beta)+1+epsilon)
        neg_dist = -tf.math.log(-tf.math.divide((N-neg_dist),beta)+1+epsilon)

        # compute loss
        loss = neg_dist + pos_dist

        return loss

    def inception_convolutional_layer(X, complexity, rate=0.2, regularizer=0):
        # Parallel same-padded convolutions with kernel sizes 7 down to 1; the
        # branch with kernel size k gets int(complexity/k) filters.
        X_7 = Conv1D(int(complexity/7), kernel_size=7, strides=1, padding='same')(X)
        X_6 = Conv1D(int(complexity/6), kernel_size=6, strides=1, padding='same')(X)
        X_5 = Conv1D(int(complexity/5), kernel_size=5, strides=1, padding='same')(X)
        X_4 = Conv1D(int(complexity/4), kernel_size=4, strides=1, padding='same')(X)
        X_3 = Conv1D(int(complexity/3), kernel_size=3, strides=1, padding='same')(X)
        X_2 = Conv1D(int(complexity/2), kernel_size=2, strides=1, padding='same')(X)
        X_1 = Conv1D(int(complexity/1), kernel_size=1, strides=1, padding='same')(X)
        X = Concatenate()([X_7, X_6, X_5, X_4, X_3, X_2, X_1])
        X = Activation('elu')(X)
        X = BatchNormalization()(X)
        X = Dropout(rate)(X)
        return X

    def bi_LSTM(X, complexity, rate=0.2, regularizer=0):
        # Bidirectional LSTM with int(complexity/2) units per direction.
        X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(int(complexity/2), return_sequences=True))(X)
        X = tf.keras.layers.BatchNormalization()(X)
        X = tf.keras.layers.Dropout(rate)(X)
        return X

    def dense_layer(X, complexity, rate=0.2, regularizer=0):
        # Dense -> elu -> batch norm -> dropout.
        X = tf.keras.layers.Dense(int(complexity))(X)
        X = tf.keras.layers.Activation('elu')(X)
        X = tf.keras.layers.BatchNormalization()(X)
        X = tf.keras.layers.Dropout(rate)(X)
        return X

    def basic_sentence_vectorizer():
        # Embedder: a convolutional branch and a gated recurrent branch over
        # the same input, merged and reduced to one sigmoid vector.
        rate = 0.20
        complexity = 300
        inp = Input(shape = (data_generator.target_len, data_generator.embedding_size))

        X = inp
        X = inception_convolutional_layer(X, complexity)
        X = inception_convolutional_layer(X, complexity)
        X = inception_convolutional_layer(X, complexity)
        X = tf.keras.layers.MaxPooling1D(pool_size=2)(X)
        X = inception_convolutional_layer(X, complexity)
        X = tf.keras.layers.MaxPooling1D(pool_size=2)(X)
        X = inception_convolutional_layer(X, complexity)
        X = tf.keras.layers.MaxPooling1D(pool_size=2)(X)

        R = inp
        R = bi_LSTM(R, complexity)
        R = bi_LSTM(R, complexity/2)
        # Element-wise sigmoid gate of the same width as the second BiLSTM output.
        attention_probs = tf.keras.layers.Dense(int(complexity/2), activation='sigmoid', name='attention_probs')(R)
        R = multiply([R, attention_probs], name='attention_mul')
        R = Dropout(rate)(R)
        R = MaxPooling1D(pool_size=2)(R)
        R = inception_convolutional_layer(R, complexity)
        R = MaxPooling1D(pool_size=2)(R)
        R = inception_convolutional_layer(R, complexity)
        R = MaxPooling1D(pool_size=2)(R)

        X = Concatenate(axis=-1)([X, R])
        X = Flatten()(X)
        X = BatchNormalization()(X)
        X = Dropout(rate)(X)

        X = dense_layer(X, complexity)
        X = dense_layer(X, complexity)
        X = dense_layer(X, complexity)

        # Sigmoid keeps every component in [0, 1], as the loss assumes.
        X = Dense(sentence_embeding_size, activation='sigmoid')(X)
        vectorizer = Model(inputs=inp, outputs=X)
        return vectorizer

    inp_anchor = Input(shape = (data_generator.target_len, data_generator.embedding_size))
    inp_positive = Input(shape = (data_generator.target_len, data_generator.embedding_size))
    inp_negative = Input(shape = (data_generator.target_len, data_generator.embedding_size))

    # One embedder instance, so all three inputs share the same weights.
    embedder = basic_sentence_vectorizer()

    anchor = embedder(inp_anchor)
    positive = embedder(inp_positive)
    negative = embedder(inp_negative)

    output = Concatenate(axis=1)([anchor, positive, negative])

    model = Model(inputs=[inp_anchor, inp_positive, inp_negative], outputs=output)
    model.compile(optimizer='adagrad', loss=lossless_triplet_loss)
    model.summary()
    return model, embedder

# Build the triplet data source, the model and the similarity scorer for section 7.2.
data_generator = TripletDataGeneratorEmbedings(target_len=20, batch_size=32, batches_per_epoch=10000)
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(
    embedder.predict,
    data_generator.vectorize,
)

# Dummy target: train_on_batch needs one; it is sized from the generator's batch_size.
zeros = np.zeros((data_generator.batch_size, 1, 1))

# Train for up to 1000 epochs, scoring every third epoch; stop once the score
# drops relative to the previous check (only after epoch 20).
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = parse_result(evaluate(get_similarity_values, 'triplet loss embeding'))
        print(i, new_result)
        if new_result < previous_result and i>20:
            break
    for a, p, n in data_generator:
        model.train_on_batch([a, p, n], zeros)


0 283.9

3 334.2

6 218.1

9 219.6

12 262.8

15 282.4

18 289.7

21 274.9



# plot_results is defined outside this part; presumably it charts the scores
# recorded by the evaluate() calls so far (the cell output is an image).
plot_results()


png





, ELMO .. . , .



BOW . , .



. , . , . , .



Triplet loss embedings . , 100 .



有两种方法——去除停用词后基于词元(lemma)的 BOW,以及按 tf-idf 加权的词向量平均——虽然平均结果并不出色,但在某些句子上却给出了非常好的结果。因此,这两种方法的效果完全取决于数据。



如果我积累了足够多的想法,很可能会在适当的时候发布第3部分。




All Articles