asreview · jteijema · Nov 22, 2021 · J535D165 · Nov 28, 2021
diff --git a/asreviewcontrib/semantic_clustering/semantic_clustering.py b/asreviewcontrib/semantic_clustering/semantic_clustering.py
@@ -56,19 +56,20 @@ def run_clustering_steps(
 
     # tokenize abstracts and add to data
     print("Tokenizing abstracts...")
-    encoded = tokenizer.batch_encode_plus(
-        data['abstract'].tolist(),
-        add_special_tokens=False,
-        truncation=True,
-        max_length=200,
-        padding='max_length',
-        return_tensors='pt')
+    encoded = data['abstract'].progress_apply(
+        lambda x: tokenizer.encode_plus(
+            x,
+            add_special_tokens=False,
+            truncation=True,
+            max_length=512,
+            # padding='max_length',
+            return_tensors='pt'))
 
     # generate embeddings and format correctly
     print("Generating embeddings...")
     embeddings = []
-    for x in tqdm(encoded.input_ids):
-        embeddings.append(model(x.unsqueeze(0), output_hidden_states=False)[-1].detach().numpy().squeeze())  # noqa: E501
+    for x in tqdm(encoded):
+        embeddings.append(model(**x, output_hidden_states=False)[-1].detach().numpy().squeeze())  # noqa: E501
 
     # from here on the data is not directly attached to the dataframe anymore,
     # as a result of legacy code. This will be fixed in a future PR.