diff options
Diffstat (limited to 'ml/dlib/python_examples/sequence_segmenter.py')
-rwxr-xr-x | ml/dlib/python_examples/sequence_segmenter.py | 197 |
1 files changed, 197 insertions, 0 deletions
diff --git a/ml/dlib/python_examples/sequence_segmenter.py b/ml/dlib/python_examples/sequence_segmenter.py new file mode 100755 index 00000000..335e475f --- /dev/null +++ b/ml/dlib/python_examples/sequence_segmenter.py @@ -0,0 +1,197 @@ +#!/usr/bin/python +# The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt +# +# This example shows how to use dlib to learn to do sequence segmentation. In +# a sequence segmentation task we are given a sequence of objects (e.g. words in +# a sentence) and we are supposed to detect certain subsequences (e.g. the names +# of people). Therefore, in the code below we create some very simple training +# sequences and use them to learn a sequence segmentation model. In particular, +# our sequences will be sentences represented as arrays of words and our task +# will be to learn to identify person names. Once we have our segmentation +# model we can use it to find names in new sentences, as we will show. +# +# COMPILING/INSTALLING THE DLIB PYTHON INTERFACE +# You can install dlib using the command: +# pip install dlib +# +# Alternatively, if you want to compile dlib yourself then go into the dlib +# root folder and run: +# python setup.py install +# or +# python setup.py install --yes USE_AVX_INSTRUCTIONS +# if you have a CPU that supports AVX instructions, since this makes some +# things run faster. +# +# Compiling dlib should work on any operating system so long as you have +# CMake installed. On Ubuntu, this can be done easily by running the +# command: +# sudo apt-get install cmake +# +import sys +import dlib + + +# The sequence segmentation models we work with in this example are chain +# structured conditional random field style models. Therefore, central to a +# sequence segmentation model is some method for converting the elements of a +# sequence into feature vectors. 
# That is, while you might start out representing your sequence as an array
# of strings, the dlib interface works in terms of arrays of feature vectors.
# Each feature vector should capture important information about its
# corresponding element in the original raw sequence.  So in this example,
# since we work with sequences of words and want to identify names, we will
# create feature vectors that tell us if the word is capitalized or not.  In
# our simple data, this will be enough to identify names.  Therefore, we
# define sentence_to_vectors() which takes a sentence represented as a string
# and converts it into an array of words and then associates a feature vector
# with each word.
def sentence_to_vectors(sentence):
    """Convert a sentence into a dlib.vectors array, one vector per word.

    The sentence is split on whitespace.  Each word contributes a single
    1-dimensional dlib.vector whose only value is 1 when the first character
    of the word is upper case and 0 otherwise.
    """
    features = dlib.vectors()
    for token in sentence.split():
        # Only the capitalization of the first letter is used as a feature.
        capital_flag = 1 if token[0].isupper() else 0
        features.append(dlib.vector([capital_flag]))
    return features


# Dlib also supports the use of a sparse vector representation.  This is more
# efficient than the above form when you have very high dimensional vectors
# that are mostly full of zeros.  In dlib, each sparse vector is represented
# as an array of pair objects.  Each pair contains an index and value.  Any
# index not listed in the vector is implicitly associated with a value of
# zero.  Additionally, when using sparse vectors with
# dlib.train_sequence_segmenter() you can use "unsorted" sparse vectors.  This
# means you can add the index/value pairs into your sparse vectors in any
# order you want and don't need to worry about them being in sorted order.
def sentence_to_sparse_vectors(sentence):
    """Convert a sentence into a dlib.sparse_vectors array, one per word.

    This is the sparse counterpart of the dense 1-dimensional encoding: a
    word whose first character is upper case maps to a sparse vector holding
    the single pair (index 0, value 1), and every other word maps to an empty
    sparse vector.
    """
    # Equivalent to dlib.vector([1]): one index/value pair at index 0.
    capitalized = dlib.sparse_vector()
    capitalized.append(dlib.pair(0, 1))
    # Nothing is ever added to this one, so it is equivalent to
    # dlib.vector([0]).
    lowercase = dlib.sparse_vector()

    features = dlib.sparse_vectors()
    for token in sentence.split():
        features.append(capitalized if token[0].isupper() else lowercase)
    return features


def print_segment(sentence, names):
    """Write the words of each segment in `names` to stdout.

    `sentence` is split on whitespace and each element of `names` is iterated
    to obtain word indices into that split.  Every word is followed by a
    single space and every segment is terminated by a newline.
    """
    tokens = sentence.split()
    emit = sys.stdout.write
    for segment in names:
        for index in segment:
            emit(tokens[index] + " ")
        emit("\n")


# Now let's make some training data.  Each example is a sentence as well as a
# set of ranges which indicate the locations of any names.  We want to detect
# person names, so for the first sentence we note that the name is located
# within the range [8, 10).  Note that we use half open ranges to identify
# segments.  So in this case, the segment identifies the string "Jim Smith".
labeled_sentences = [
    ("The other day I saw a man named Jim Smith", [(8, 10)]),
    ("Davis King is the main author of the dlib Library", [(0, 2)]),
    ("Bob Jones is a name and so is George Clinton", [(0, 2), (8, 10)]),
    ("My dog is named Bob Barker", [(4, 6)]),
    ("ABC is an acronym but John James Smith is a name", [(5, 8)]),
    ("No names in this sentence at all", []),
]

names = dlib.ranges()      # make an array of dlib.range objects.
segments = dlib.rangess()  # make an array of arrays of dlib.range objects.
sentences = []
for text, spans in labeled_sentences:
    sentences.append(text)
    for begin, end in spans:
        names.append(dlib.range(begin, end))
    segments.append(names)
    names.clear()  # make names empty for use again on the next sentence


# Now before we can pass these training sentences to the dlib tools we need to
# convert them into arrays of vectors as discussed above.  We can use either a
# sparse or dense representation depending on our needs.  In this example, we
# show how to do it both ways.
use_sparse_vects = False
if use_sparse_vects:
    # Make an array of arrays of dlib.sparse_vector objects.
    training_sequences = dlib.sparse_vectorss()
    to_vectors = sentence_to_sparse_vectors
else:
    # Make an array of arrays of dlib.vector objects.
    training_sequences = dlib.vectorss()
    to_vectors = sentence_to_vectors
for s in sentences:
    training_sequences.append(to_vectors(s))

# Now that we have a simple training set we can train a sequence segmenter.
# However, the sequence segmentation trainer has some optional parameters we
# can set.  These parameters determine properties of the segmentation model we
# will learn.  See the dlib documentation for the sequence_segmenter object
# for a full discussion of their meanings.
params = dlib.segmenter_params()
params.window_size = 3
params.use_high_order_features = True
params.use_BIO_model = True
# This is the common SVM C parameter.  Larger values encourage the trainer to
# attempt to fit the data exactly but might overfit.  In general, you
# determine this parameter by cross-validation.
params.C = 10

# Train a model.  The model object is responsible for predicting the locations
# of names in new sentences.
model = dlib.train_sequence_segmenter(training_sequences, segments, params)

# Let's print out the things the model thinks are names.  The output is a set
# of ranges which are predicted to contain names.  If you run this example
# program you will see that it gets them all correct.
for i, s in enumerate(sentences):
    predicted_names = model(training_sequences[i])
    print_segment(s, predicted_names)

# Let's also try segmenting a new sentence.  This will print out "Bob Bucket".
# Note that we need to remember to use the same vector representation as we
# used during training, which is why the converter chosen above is reused.
test_sentence = ("There once was a man from Nantucket "
                 "whose name rhymed with Bob Bucket")
print_segment(test_sentence, model(to_vectors(test_sentence)))

# We can also measure the accuracy of a model relative to some labeled data.
# This statement prints the precision, recall, and F1-score of the model
# relative to the data in training_sequences/segments.
training_scores = dlib.test_sequence_segmenter(
    model, training_sequences, segments)
print("Test on training data: {}".format(training_scores))

# We can also do 5-fold cross-validation and print the resulting precision,
# recall, and F1-score.
cross_validation_scores = dlib.cross_validate_sequence_segmenter(
    training_sequences, segments, 5, params)
print("Cross validation: {}".format(cross_validation_scores))