This document can be used to prepare text data for Pathfinder.

Code is sampled from Rohan-Paul-AI: "https://www.youtube.com/watch?v=30zPz5Xz-8g" and Goonmeet Bajaj (Ohio State, AI PhD Candidate).

Consider using Word2Vec for single words, and encode_plus might be useful. BertSentence could be good for longer phrases or paragraph format data.
Batch refers to feedings lots of data at once.

In [1]:
#!pip install transformers -q #install BERT package

In [2]:
import torch #check if pytorch is installed
#torch.version
from transformers import BertModel, BertTokenizer #import bert

In [3]:
model = BertModel.from_pretrained('bert-base-uncased') #instantiate pretrained model

#It may be worth trying different pretrained models.

In [4]:
testSentence = 'She is a MachineLearning Engineer and works in California' #sample data
tS2 = 'He is a DataScientist Engineer and works in Detroit'

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') #instantiate pretrained tokenizer

In [6]:
tokens = tokenizer.tokenize(testSentence) #tokenize data
t2 = tokenizer.tokenize(tS2)

In [7]:
print(tokens) #visualize tokenized data

['she', 'is', 'a', 'machine', '##lea', '##rn', '##ing', 'engineer', 'and', 'works', 'in', 'california']


In [8]:
len(tokens)
len(t2)

12

In [9]:
tokens = ['[CLS]'] + tokens + ['[SEP]'] #add CLS and SEP token to data.
t2 = ['[CLS]'] + t2 + ['[SEP]']

In [10]:
print(tokens)

['[CLS]', 'she', 'is', 'a', 'machine', '##lea', '##rn', '##ing', 'engineer', 'and', 'works', 'in', 'california', '[SEP]']


In [11]:
len(tokens) #check token count, should be 2 more than last

14

In [12]:
tokens = tokens + ['[PAD]'] + ['[PAD]'] #add padding to get to 16 for BERT
t2 = t2 + ['[PAD]'] + ['[PAD]']

In [13]:
len(tokens) #Check LEN

16

In [14]:
attention_mask = [1 if i!= '[PAD]' else 0 for i in tokens] #attention masks help out model focus on certain data
attention_mask = [1 if i!= '[PAD]' else 0 for i in t2]

#This code will not work when vectors are different sizes. 
#I will need to create a function to pad each vector to equal length. 

In [15]:
print(attention_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]


In [16]:
token_ids = tokenizer.convert_tokens_to_ids(tokens) #convert to id values
token_ids2 = tokenizer.convert_tokens_to_ids(t2)

In [17]:
print(token_ids)
print(token_ids2)

[101, 2016, 2003, 1037, 3698, 19738, 6826, 2075, 3992, 1998, 2573, 1999, 2662, 102, 0, 0]
[101, 2002, 2003, 1037, 2951, 11020, 11638, 2923, 3992, 1998, 2573, 1999, 5626, 102, 0, 0]


In [18]:
token_ids = torch.tensor(token_ids).unsqueeze(0) #now feed the token ids and attention mask into the 
#model, First convert to tensor and unsqueeze
token_ids2 = torch.tensor(token_ids2).unsqueeze(0)

In [19]:
attention_mask = torch.tensor(attention_mask).unsqueeze(0)

In [20]:
output = model(token_ids, attention_mask = attention_mask) #pass input to model
output2 = model(token_ids2, attention_mask = attention_mask)

In [22]:
output #display BERT output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.1925,  0.1684, -0.4252,  ..., -0.2599,  0.3736,  0.0529],
         [ 0.2417, -0.2748, -0.4909,  ...,  0.1372,  0.3408, -0.4655],
         [-0.0871,  0.0837,  0.2605,  ..., -0.4635, -0.0462,  0.2621],
         ...,
         [ 0.6711, -0.0076, -0.3847,  ..., -0.1289, -0.5171, -0.8002],
         [-0.2731,  0.1098, -0.5440,  ...,  0.0314,  0.4467, -0.3448],
         [-0.2387,  0.0119, -0.4760,  ...,  0.4656,  0.5837, -0.3774]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.9531, -0.4914, -0.8872,  0.9035,  0.8174, -0.2919,  0.9511,  0.4982,
         -0.7595, -1.0000, -0.6996,  0.9459,  0.9890,  0.4754,  0.9723, -0.8460,
         -0.1423, -0.7209,  0.4428, -0.7905,  0.7822,  1.0000,  0.2119,  0.4066,
          0.5813,  0.9923, -0.8380,  0.9670,  0.9746,  0.8324, -0.8227,  0.4136,
         -0.9931, -0.2821, -0.8860, -0.9961,  0.5261, -0.8722, -0.0915, -0.0950,
         -0.9237,  0.5106,  1.00

In [112]:
output[0].shape #batch size, sequence length (token amount), hidden size 768 dimensions?

torch.Size([1, 16, 768])

In [113]:
output[1]

tensor([[-0.9531, -0.4914, -0.8872,  0.9035,  0.8174, -0.2919,  0.9511,  0.4982,
         -0.7595, -1.0000, -0.6996,  0.9459,  0.9890,  0.4754,  0.9723, -0.8460,
         -0.1423, -0.7209,  0.4428, -0.7905,  0.7822,  1.0000,  0.2119,  0.4066,
          0.5813,  0.9923, -0.8380,  0.9670,  0.9746,  0.8324, -0.8227,  0.4136,
         -0.9931, -0.2821, -0.8860, -0.9961,  0.5261, -0.8722, -0.0915, -0.0950,
         -0.9237,  0.5106,  1.0000, -0.0830,  0.5382, -0.3140, -1.0000,  0.3774,
         -0.9557,  0.8998,  0.7947,  0.8279,  0.2756,  0.6581,  0.6064, -0.3369,
          0.0251,  0.1856, -0.3297, -0.7515, -0.6843,  0.4392, -0.8613, -0.9603,
          0.8838,  0.7763, -0.3041, -0.3105, -0.1854, -0.0969,  0.9726,  0.3039,
          0.0437, -0.8890,  0.6619,  0.2710, -0.7410,  1.0000, -0.5422, -0.9900,
          0.7010,  0.7629,  0.6910, -0.1635,  0.4133, -1.0000,  0.6527, -0.1595,
         -0.9959,  0.2069,  0.5956, -0.3285,  0.3339,  0.7146, -0.4482, -0.5843,
         -0.4799, -0.8426, -

In [23]:
output["last_hidden_state"][-1]

tensor([[-0.1925,  0.1684, -0.4252,  ..., -0.2599,  0.3736,  0.0529],
        [ 0.2417, -0.2748, -0.4909,  ...,  0.1372,  0.3408, -0.4655],
        [-0.0871,  0.0837,  0.2605,  ..., -0.4635, -0.0462,  0.2621],
        ...,
        [ 0.6711, -0.0076, -0.3847,  ..., -0.1289, -0.5171, -0.8002],
        [-0.2731,  0.1098, -0.5440,  ...,  0.0314,  0.4467, -0.3448],
        [-0.2387,  0.0119, -0.4760,  ...,  0.4656,  0.5837, -0.3774]],
       grad_fn=<SelectBackward0>)

In [115]:
#Questions
#where is the CLS data? I think in the last hidden layer.
#what are the bard layers?

In [27]:
#attempt at cosine spacial distance using CLS vectors
from scipy.spatial import distance
#disCos = distance.cosine(output[1], output2[1])
#disCos = distance.cosine(output["last_hidden_state"][-1], output2["last_hidden_state"][-1])
#disCos = distance.cosine(output["last_hidden_state"][-1].detach().numpy(), output2["last_hidden_state"][-1].detach().numpy())
disCos = distance.cosine(output[1].detach().numpy(), output2[1].detach().numpy())
print(disCos)

testSentence = 'She is a MachineLearning Engineer and works in California' #sample data
tS2 = 'He is a DataScientist Engineer and works in Detroit'

0.011377573013305664


https://research-collective.com/PFWeb/index.html