import numpy as np
from tqdm import tqdm

# Build one record per DataFrame row containing the title, the raw text, the
# lowercased lemmas of its content tokens (alphabetic, non-stop-word) and the
# mean of those tokens' vectors. `df` and `nlp` are expected to be defined
# before this point.
texts = df.text.to_list()
titles = df.title.to_list()
processed_data = []

# Length of the zero vector used for documents without any content token.
# NOTE(review): assumes the loaded pipeline ships static word vectors, so that
# nlp.vocab.vectors_length matches the length of each token.vector - confirm.
vector_dim = nlp.vocab.vectors_length

with nlp.select_pipes(enable=["tok2vec", "lemmatizer", "attribute_ruler"]):
    # pipe() is more efficient than calling nlp() on each text in a loop.
    docs = tqdm(nlp.pipe(texts, batch_size=20), total=len(texts))

    # The doc iterator goes first in zip() so that it is the one that gets
    # exhausted, which lets tqdm finish its progress bar cleanly.
    for doc, title, text in zip(docs, titles, texts):
        # Filter once; both the lemmas and the vectors use the same tokens.
        content_tokens = [t for t in doc if not t.is_stop and t.is_alpha]

        if content_tokens:
            # Averaging is delegated to NumPy's C implementation.
            doc_vector = np.mean([t.vector for t in content_tokens], axis=0)
        else:
            # float32 keeps the fallback consistent with spaCy's token
            # vectors instead of np.zeros' float64 default.
            doc_vector = np.zeros(vector_dim, dtype=np.float32)

        processed_data.append({
            "title": title,
            "text": text,
            "tokens": [t.lemma_.lower() for t in content_tokens],
            "vector": doc_vector,
        })