Lightning tour

The following examples and code snippets give you an overview of spaCy's functionality and its usage. If you're new to spaCy, make sure to check out the spaCy 101 guide.

Install models and process text

python -m spacy download en
python -m spacy download de

import spacy
nlp = spacy.load('en')
doc = nlp(u'Hello, world. Here are two sentences.')

nlp_de = spacy.load('de')
doc_de = nlp_de(u'Ich bin ein Berliner.')
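
Once a model is loaded, the returned Doc object gives you access to the annotations. As a quick sanity check, you can iterate over the tokens of either document and print a few attributes (a minimal sketch; the exact tags depend on the model you downloaded):

for token in doc_de:
    print(token.text, token.pos_, token.dep_)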

Get tokens, noun chunks & sentences

doc = nlp(u"Peach emoji is where it has always been. Peach is the superior "
          u"emoji. It's outranking eggplant 🍑 ")

assert doc[0].text == u'Peach'
assert doc[1].text == u'emoji'
assert doc[-1].text == u'🍑'
assert doc[17:19].text == u'outranking eggplant'
assert list(doc.noun_chunks)[0].text == u'Peach emoji'

sentences = list(doc.sents)
assert len(sentences) == 3
assert sentences[1].text == u'Peach is the superior emoji.'
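
Noun chunks and sentences are Span objects, so they support the same slicing and attribute access shown above. A small sketch (the chunks you get depend on the parser):

for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_)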

Get part-of-speech tags and flags

doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
apple = doc[0]
assert [apple.pos_, apple.pos] == [u'PROPN', 17049293600679659579]
assert [apple.tag_, apple.tag] == [u'NNP', 15794550382381185553]
assert [apple.shape_, apple.shape] == [u'Xxxxx', 16072095006890171862]
assert apple.is_alpha == True
assert apple.is_punct == False

billion = doc[10]
assert billion.is_digit == False
assert billion.like_num == True
assert billion.like_email == False
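
To see these attributes side by side, you can loop over the whole Doc (a minimal sketch; the tag values depend on the loaded model):

for token in doc:
    print(token.text, token.pos_, token.tag_, token.shape_, token.is_alpha, token.is_stop)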

Use hash values for any string

doc = nlp(u'I love coffee')
coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401
coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee'

assert doc[2].orth == coffee_hash == 3197928453018144401
assert doc[2].text == coffee_text == u'coffee'

beer_hash = doc.vocab.strings.add(u'beer') # 3073001599257881079
beer_text = doc.vocab.strings[beer_hash] # 'beer'

unicorn_hash = doc.vocab.strings.add(u'🦄 ') # 18234233413267120783
unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄 '
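
The hash is computed from the string itself, not from the state of a particular vocab, so equal strings always map to equal hashes. You can verify this without going through a Doc by calling the hash function directly (a sketch, assuming the hash_string helper in spacy.strings):

from spacy.strings import hash_string
assert hash_string(u'coffee') == coffee_hash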

Recognise and update named entities

doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
ents = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
assert ents == [(u'San Francisco', 0, 13, u'GPE')]

from spacy.tokens import Span
doc = nlp(u'Netflix is hiring a new VP of global policy')
doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])]
ents = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
assert ents == [(0, 7, u'ORG')]
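
Entity annotations set on the Doc are also reflected on the individual tokens via the IOB scheme, for example:

netflix = doc[0]
assert netflix.ent_type_ == u'ORG'
assert netflix.ent_iob_ == u'B' # beginning of the entity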

Visualize a dependency parse and named entities in your browser

from spacy import displacy

doc_dep = nlp(u'This is a sentence.')
displacy.serve(doc_dep, style='dep')

doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at Google '
              u'in 2007, few people outside of the company took him seriously.')
displacy.serve(doc_ent, style='ent')
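
If you'd rather get the markup as a string, for example to save it to a file or embed it in your own page, you can use displacy.render instead of displacy.serve (passing page=True wraps the output in a full HTML page):

html_dep = displacy.render(doc_dep, style='dep', page=True)
html_ent = displacy.render(doc_ent, style='ent', page=True)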

Get word vectors and similarity

doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")
apple = doc[0]
banana = doc[2]
pasta = doc[6]
hippo = doc[8]
assert apple.similarity(banana) > pasta.similarity(hippo)
assert all(token.has_vector for token in (apple, banana, pasta, hippo))
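
Similarity also works on whole Doc and Span objects, which compare their averaged token vectors. A minimal sketch (the exact score depends on the model's vectors):

doc1 = nlp(u'I like fries and burgers.')
doc2 = nlp(u'Fast food tastes very good.')
print(doc1.similarity(doc2))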

Simple and efficient serialization

import spacy
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab

nlp = spacy.load('en')
moby_dick = open('moby_dick.txt', 'r').read()
doc = nlp(moby_dick)
doc.to_disk('/moby_dick.bin')

new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
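
The same round trip works in memory via byte strings, which is handy if you want to store the data somewhere other than the file system (a sketch using the Doc from above):

data = doc.to_bytes()
restored_doc = Doc(Vocab()).from_bytes(data)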

Match text with token rules

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)

def set_sentiment(matcher, doc, i, matches):
    doc.sentiment += 0.1

pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
matcher.add('HAPPY', set_sentiment, *pattern2) # match one or more happy emoji
doc = nlp(LOTS_OF_TEXT)
matches = matcher(doc)
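
Each match is a (match_id, start, end) triple, so you can recover both the matched span and the pattern name from the vocab. A small end-to-end sketch:

doc = nlp(u'Had a great time at Google I/O 😀 😀')
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)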

Multi-threaded generator

texts = [u'One document.', u'...', u'Lots of documents']
# .pipe streams input, and produces streaming output
iter_texts = (texts[i % 3] for i in xrange(100000000))
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
    assert doc.is_parsed
    if i == 100:
        break
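
Because .pipe yields fully annotated Doc objects, you can aggregate whatever you need as the stream comes through, for instance collecting entities (a sketch; the entities found depend on the model):

all_ents = []
for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
    all_ents.extend((ent.text, ent.label_) for ent in doc.ents)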

Get syntactic dependencies

def dependency_labels_to_root(token):
    """Walk up the syntactic tree, collecting the arc labels."""
    dep_labels = []
    while token.head is not token:
        dep_labels.append(token.dep_)
        token = token.head
    return dep_labels
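
For example, calling the helper on a token of a parsed sentence walks up through its heads until it reaches the root (the exact labels depend on the parse):

doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers')
print(dependency_labels_to_root(doc[0]))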

Export to numpy arrays

from spacy.attrs import ORTH, LIKE_URL, IS_OOV

attr_ids = [ORTH, LIKE_URL, IS_OOV]
doc_array = doc.to_array(attr_ids)
assert doc_array.shape == (len(doc), len(attr_ids))
assert doc[0].orth == doc_array[0, 0]
assert doc[1].orth == doc_array[1, 0]
assert doc[0].like_url == doc_array[0, 1]
assert list(doc_array[:, 1]) == [t.like_url for t in doc]
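
Because the result is a plain numpy array, you can run numpy operations on it directly, for example to count URL-like tokens (a small sketch using a new example text):

import numpy
doc = nlp(u'Check out https://spacy.io and read the docs')
doc_array = doc.to_array(attr_ids)
print(numpy.sum(doc_array[:, 1])) # number of tokens that look like a URL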

Calculate inline markup on original string

def put_spans_around_tokens(doc, get_classes):
    """Given some function to compute class names, put each token in a
    span element, with the appropriate classes computed. All whitespace is
    preserved, outside of the spans. (Of course, HTML won't display more than
    one whitespace character in a row, but the point is that no information
    is lost and you can calculate what you need, e.g. <br />, <p> etc.)
    """
    output = []
    html = '<span class="{classes}">{word}</span>{space}'
    for token in doc:
        if token.is_space:
            output.append(token.text)
        else:
            classes = ' '.join(get_classes(token))
            output.append(html.format(classes=classes, word=token.text, space=token.whitespace_))
    string = ''.join(output)
    string = string.replace('\n', '')
    string = string.replace('\t', '    ')
    return string
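
For example, with a hypothetical class function that just exposes the coarse part-of-speech tag, the call could look like this:

doc = nlp(u'This is a test.\n\nHello   world.')
html = put_spans_around_tokens(doc, lambda token: [u'pos-' + token.pos_])
print(html)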