convert_to_embeddings.py
#This is to turn the captions into text embeddings using gensim
#Might work
from collections import defaultdict
import numpy as np
import pickle
import gensim
from gensim.models.keyedvectors import KeyedVectors
#Instead of using Justin's break, one can use the function that was created:
"""
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile
path = get_tmpfile("wordvectors.kv")
model.wv.save(path)
wv = KeyedVectors.load(path, mmap='r')
vector = wv['computer']  # numpy vector of a word
#Will divide the text into sentences
return [sent for sent in document.sents]
"""
#input: an array of sentences
#output: an array of embeddings
path = r".\glove.6B.50d.txt.w2v"
glove = KeyedVectors.load_word2vec_format(path, binary=False)
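#The .w2v file loaded above is assumed to be the Stanford GloVe text file converted
#to word2vec format; one way to produce it, depending on the gensim version, is
#sketched below (the file names are placeholders):
"""
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec("glove.6B.50d.txt", "glove.6B.50d.txt.w2v")
#with gensim >= 4.0 the raw GloVe file can also be read directly:
#glove = KeyedVectors.load_word2vec_format("glove.6B.50d.txt", binary=False, no_header=True)
"""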
def to_embeddings(arr_statements):
    #returns a NumPy array; with a sentence-encoder model one could instead retrieve
    #the underlying array via: my_vectors = model.encode(data).data
    """
    :param
    arr_statements: array of M caption strings
    Take in M captions, map each one to a (1, 50) vector, then stack them together
    :return
    main_vectors: an (M, 50) np.ndarray that represents our captions in semantic space
    """
    m = len(arr_statements)
    main_vectors = np.zeros((m, 50))
    for i, sentence in enumerate(arr_statements):
        vectors = np.zeros((50,))
        for word in sentence.split():
            #skip words that are missing from the GloVe vocabulary
            if word in glove:
                word_vector = glove[word]
                vectors += word_vector
        main_vectors[i] = vectors
    return main_vectors
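#A small usage sketch (the captions below are made up, for illustration only):
"""
captions = ["a dog runs on the beach", "a person types on a computer"]
embeddings = to_embeddings(captions)
print(embeddings.shape)  # (2, 50)
"""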