Word2vec

Word2vec can measure how close words are in meaning. For example, in the sentences used below, Cat 🐱 and Pot 🍯 appear in the same contexts, so the model learns that they mean similar things and places their vectors close together.

Example Code

import numpy as np

def softmax(x):
    # Subtract the row-wise max for numerical stability (prevents overflow in exp)
    pre = x - np.max(x, axis=1, keepdims=True)
    # e ^ pre
    e_x = np.exp(pre)
    # Normalize each row so it sums to 1
    e_sum = e_x.sum(axis=1, keepdims=True)
    return e_x / e_sum

def cross_entropy(y, x):
    # Average of -sum(y * log(x)) over the batch
    pre = y * np.log(x)
    return -np.sum(pre) / len(y)

class SoftmaxAndCrossEntropyLayer:
    def __init__(self, y):
        self.y = y

    def predict(self, x):
        return softmax(x)

    def forward(self, x):
        self.x = softmax(x)
        return cross_entropy(self.y, self.x)

    def backward(self):
        # Differentiation of Cross Entropy + Softmax:
        # softmax output minus the one-hot target
        # (the 1/N factor of the averaged loss is folded into the learning rate)
        dX = self.x - self.y
        return dX

class LinearLayer:
    def __init__(self, lr, w, b):
        self.lr = lr
        self.w = w
        self.b = b

    def forward(self, x):
        self.x = x
        return np.dot(self.x, self.w) + self.b

    def backward(self, prev_d_x):
        # Partial derivative with respect to "x"
        dX = np.dot(prev_d_x, self.w.T)
        # Partial derivative with respect to "w"
        dW = np.dot(self.x.T, prev_d_x)
        # Partial derivative with respect to "b"
        dB = np.sum(prev_d_x, axis=0)

        # Gradient descent update of this layer's parameters
        self.w -= self.lr * dW
        self.b -= self.lr * dB

        return dX

def to_one_hot(word_id, vocab_size):
    x = np.zeros(vocab_size)
    x[word_id] = 1
    return x

# Skip-Gram

def generate_training_data(tokens, window_size):
    x, y = [], []

    tokens_size = len(tokens)

    for i in range(tokens_size):
        # Context window boundaries around the center word at position i
        s = max(0,           i - window_size)
        e = min(tokens_size, i + window_size + 1)

        for j in range(s, e):
            if i == j:
                continue

            # Input: the center word, target: one of its context words
            x.append(to_one_hot(word_to_id[tokens[i]], len(word_to_id)))
            y.append(to_one_hot(word_to_id[tokens[j]], len(word_to_id)))

    return np.array(x), np.array(y)

np.random.seed(0)

epochs = 10000
lr = 0.01
embed_size = 2
window_size = 1

tokens = []

word_to_id = {}
id_to_word = {}

text = "Cat is cute Cat is cool Cat is good Pot is cute Pot is cool Pot is good"

tokens = text.split()

# Sorted so the word-to-id mapping is reproducible between runs
words = sorted(set(tokens))

words_size = len(words)

for i, word in enumerate(words):
    word_to_id[word] = i
    id_to_word[i] = word

x0, y0 = generate_training_data(tokens, window_size)

w_to_e_linear_layer = LinearLayer(
    lr, np.random.randn(words_size, embed_size), np.random.rand(embed_size))
e_to_w_linear_layer = LinearLayer(
    lr, np.random.randn(embed_size, words_size), np.random.rand(words_size))
softmax_and_cross_entropy_layer = SoftmaxAndCrossEntropyLayer(y0)

for epoch in range(epochs):
    x1 = w_to_e_linear_layer.forward(x0)
    x2 = e_to_w_linear_layer.forward(x1)

    l1 = softmax_and_cross_entropy_layer.forward(x2)

    if epoch % 100 == 0:
        print(l1)

    d1 = softmax_and_cross_entropy_layer.backward()
    d2 = e_to_w_linear_layer.backward(d1)
    d3 = w_to_e_linear_layer.backward(d2)

print(word_to_id)
print(w_to_e_linear_layer.w)  # the embeddings for Cat and Pot are close to each other
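
To check that claim, you can compare the learned vectors directly. This is a minimal sketch, assuming the script above has already been run; cosine_similarity is a helper defined here for illustration, not part of the original code.

def cosine_similarity(a, b):
    # Cosine of the angle between two vectors: values near 1 mean very similar
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

cat_vec = w_to_e_linear_layer.w[word_to_id["Cat"]]
pot_vec = w_to_e_linear_layer.w[word_to_id["Pot"]]

print(cosine_similarity(cat_vec, pot_vec))  # expected to be close to 1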

Note

We use the Skip-Gram model with one-hot encoded input: the input is the one-hot encoding of a center word, and the training target is the one-hot encoding of each word inside its context window.

def generate_training_data(tokens, window_size):
    x, y = [], []

    tokens_size = len(tokens)

    for i in range(tokens_size):
        s = max(0,           i - window_size)
        e = min(tokens_size, i + window_size + 1)

        for j in range(s, e):
            if i == j:
                continue

            x.append(to_one_hot(word_to_id[tokens[i]], len(word_to_id)))
            y.append(to_one_hot(word_to_id[tokens[j]], len(word_to_id)))

    return np.array(x), np.array(y)
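
As a quick illustration (a toy sketch, not part of the original script), with window_size = 1 and the three tokens "Cat is cute", the center word "is" is paired with both of its neighbors:

tokens_demo = ["Cat", "is", "cute"]
window_size_demo = 1

for i in range(len(tokens_demo)):
    s = max(0, i - window_size_demo)
    e = min(len(tokens_demo), i + window_size_demo + 1)
    for j in range(s, e):
        if i == j:
            continue
        print(tokens_demo[i], "->", tokens_demo[j])

# Cat -> is
# is -> Cat
# is -> cute
# cute -> is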

So, how do we predict the correct answer (the context words) from the input data?

We use a linear layer with weights and biases.

def forward(self, x):
    self.x = x
    return np.dot(self.x, self.w) + self.b
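
Concretely, a one-hot input of shape (N, vocab_size) times a weight matrix of shape (vocab_size, embed_size) gives an (N, embed_size) output, so this layer simply picks out one embedding row per word. A small shape check (the sizes here are illustrative only):

x_demo = np.eye(4)              # four one-hot vectors, vocab_size = 4
w_demo = np.random.randn(4, 2)  # vocab_size = 4, embed_size = 2
b_demo = np.zeros(2)

print((np.dot(x_demo, w_demo) + b_demo).shape)  # (4, 2): one 2-dimensional vector per word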

The weights and biases are initialized with random values.

linear_layer = LinearLayer(lr, np.random.randn(4, 2), np.random.rand(2))

Additionally, the output of the linear layers is passed through an activation function, softmax, which turns the scores into probabilities.

def softmax(x):
    # Subtract the row-wise max for numerical stability (prevents overflow in exp)
    pre = x - np.max(x, axis=1, keepdims=True)
    # e ^ pre
    e_x = np.exp(pre)
    # Normalize each row so it sums to 1
    e_sum = e_x.sum(axis=1, keepdims=True)
    return e_x / e_sum
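
For example, calling the softmax defined above on one row of arbitrary scores, every output lies between 0 and 1 and the row sums to 1:

scores = np.array([[2.0, 1.0, 0.1]])
probs = softmax(scores)

print(probs)        # roughly [[0.66, 0.24, 0.10]]
print(probs.sum())  # 1.0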

So, we calculate the difference between the prediction (the input multiplied by the weights, plus the biases, passed through softmax) and the one-hot encoded answer, using cross entropy.

def cross_entropy(y, x):
    # Average of -sum(y * log(x)) over the batch
    pre = y * np.log(x)
    return -np.sum(pre) / len(y)
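
For example, with the cross_entropy defined above and one sample (values chosen for illustration), a confident correct prediction gives a small loss and a wrong prediction gives a large one:

y_true = np.array([[0, 1, 0]])

good_pred = np.array([[0.1, 0.8, 0.1]])
bad_pred = np.array([[0.8, 0.1, 0.1]])

print(cross_entropy(y_true, good_pred))  # about 0.22
print(cross_entropy(y_true, bad_pred))   # about 2.30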

After calculating the loss, we differentiate it to find the size and direction of the error, and then update the weights and biases accordingly.

def backward(self):
    # Differentiation of Cross Entropy + Softmax:
    # softmax output minus the one-hot target
    # (the 1/N factor of the averaged loss is folded into the learning rate)
    dX = self.x - self.y
    return dX
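
As a concrete illustration with made-up numbers: if the softmax output is [0.66, 0.24, 0.10] and the one-hot answer is [0, 1, 0], the gradient is negative for the correct word and positive for the others, so the gradient-descent step raises the correct word's score and lowers the rest:

p = np.array([[0.66, 0.24, 0.10]])  # softmax output
y = np.array([[0.0, 1.0, 0.0]])     # one-hot answer

print(p - y)  # [[ 0.66 -0.76  0.10]]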