# Sample passage used throughout the n-gram / collocation examples.
mytext = """I traveled to Afghanistan in 2015 to film a documentary about the United States drone war. When my production partner and I inquired about kidnapping insurance, we were told that it would cost more than $20,000 to cover the director of photography and me. We couldn’t afford such a high premium and declined the offer."""

import spacy

# Load the English pipeline by its full package name. The bare 'en' shortcut
# link was removed in spaCy 3 and raises OSError there, whereas the package
# name works on both spaCy 2 and 3
# (install with: python -m spacy download en_core_web_sm).
nlp = spacy.load('en_core_web_sm')

# Tokenize and tag the passage; `doc` is a sequence of spaCy tokens.
doc = nlp(mytext)
# Omit punctuation and extra spaces, and make everything lowercase.
# Both filters are applied to the spaCy tokens in a single pass: once a token
# has been reduced to a plain string it no longer has a .pos_ attribute, so
# the part-of-speech test cannot be done afterwards on `words`.
words = [
    token.text.lower()
    for token in doc
    if token.pos_ not in ('PUNCT', 'SPACE')
]

# Build the trigrams: each entry is a list of three words, namely the current
# word and the two words after it.
# Stopping two positions before the end skips the incomplete slices that
# would otherwise appear there (a two-word pair, then the last word alone).
# If there are fewer than three words the range is empty and so is the result.
trigrams = [words[i:i + 3] for i in range(len(words) - 2)]

# trigrams is now a list of lists
trigrams
# To do this we need nltk installed.
from nltk.util import ngrams

# ngrams() produces tuples (not lists) and, in NLTK 3, does so lazily: the
# object it returns can be iterated only once. Materialise it immediately so
# that `trigrams` is a real list of tuples that can be displayed here and
# still reused afterwards.
trigrams = list(ngrams(words, 3))

trigrams
import nltk
from nltk.collocations import *

# Collect bigram statistics over the cleaned word list
# (default window: adjacent words only).
finder = BigramCollocationFinder.from_words(words)

# Association measures used to score candidate bigrams.
bigram_measures = nltk.collocations.BigramAssocMeasures()

# Show the 10 bigrams with the highest pointwise mutual information (PMI).
finder.nbest(score_fn=bigram_measures.pmi, n=10)
import nltk
from nltk.collocations import *

# Same as the adjacent-bigram search, but pairs are counted within a
# six-word window instead of requiring the two words to be neighbours.
finder = BigramCollocationFinder.from_words(words, window_size=6)

# Association measures used to score candidate bigrams.
bigram_measures = nltk.collocations.BigramAssocMeasures()

# Show the 10 word pairs with the highest pointwise mutual information (PMI).
finder.nbest(score_fn=bigram_measures.pmi, n=10)