1 files changed, 197 insertions, 0 deletions
diff --git a/ml/dlib/python_examples/sequence_segmenter.py b/ml/dlib/python_examples/sequence_segmenter.py
new file mode 100755
index 00000000..335e475f
--- /dev/null
+++ b/ml/dlib/python_examples/sequence_segmenter.py
@@ -0,0 +1,197 @@
+#!/usr/bin/python
+# The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
+#
+# This example shows how to use dlib to learn to do sequence segmentation.  In
+# a sequence segmentation task we are given a sequence of objects (e.g. words in
+# a sentence) and we are supposed to detect certain subsequences (e.g. the names
+# of people).  Therefore, in the code below we create some very simple training
+# sequences and use them to learn a sequence segmentation model.  In particular,
+# our sequences will be sentences represented as arrays of words and our task
+# will be to learn to identify person names.  Once we have our segmentation
+# model we can use it to find names in new sentences, as we will show.
+#
+# COMPILING/INSTALLING THE DLIB PYTHON INTERFACE
+#   You can install dlib using the command:
+#       pip install dlib
+#
+#   Alternatively, if you want to compile dlib yourself then go into the dlib
+#   root folder and run:
+#       python setup.py install
+#   or
+#       python setup.py install --yes USE_AVX_INSTRUCTIONS
+#   if you have a CPU that supports AVX instructions, since this makes some
+#   things run faster.  
+#
+#   Compiling dlib should work on any operating system so long as you have
+#   CMake installed.  On Ubuntu, this can be done easily by running the
+#   command:
+#       sudo apt-get install cmake
+#
+import sys
+import dlib
+
+
+# The sequence segmentation models we work with in this example are chain
+# structured conditional random field style models.  Therefore, central to a
+# sequence segmentation model is some method for converting the elements of a
+# sequence into feature vectors. That is, while you might start out representing
+# your sequence as an array of strings, the dlib interface works in terms of
+# arrays of feature vectors.  Each feature vector should capture important
+# information about its corresponding element in the original raw sequence.  So
+# in this example, since we work with sequences of words and want to identify
+# names, we will create feature vectors that tell us if the word is capitalized
+# or not.  In our simple data, this will be enough to identify names.
+# Therefore, we define sentence_to_vectors() which takes a sentence represented
+# as a string and converts it into an array of words and then associates a
+# feature vector with each word.
+def sentence_to_vectors(sentence):
+    # Returns a dlib.vectors array containing one dense feature vector for
+    # each whitespace-separated word of the sentence, in order.
+    features = dlib.vectors()
+    for token in sentence.split():
+        # Every vector is 1-dimensional.  Its single feature is 1 when the
+        # first character of the word is uppercase and 0 otherwise, so the
+        # boolean returned by isupper() is converted directly into that
+        # 1/0 value.
+        capitalized = int(token[0].isupper())
+        features.append(dlib.vector([capitalized]))
+    return features
+
+
+# Dlib also supports the use of a sparse vector representation.  This is more
+# efficient than the above form when you have very high dimensional vectors that
+# are mostly full of zeros.  In dlib, each sparse vector is represented as an
+# array of pair objects.  Each pair contains an index and value.  Any index not
+# listed in the vector is implicitly associated with a value of zero.
+# Additionally, when using sparse vectors with dlib.train_sequence_segmenter()
+# you can use "unsorted" sparse vectors.  This means you can add the index/value
+# pairs into your sparse vectors in any order you want and don't need to worry
+# about them being in sorted order.
+def sentence_to_sparse_vectors(sentence):
+    # Build the two possible feature vectors once and reuse them per word.
+    # An empty sparse vector has no explicit entries, so every index reads
+    # as zero; this makes `lowercase` equivalent to dlib.vector([0]).
+    lowercase = dlib.sparse_vector()
+    # A single (index 0, value 1) pair makes `capitalized` equivalent to
+    # dlib.vector([1]).
+    capitalized = dlib.sparse_vector()
+    capitalized.append(dlib.pair(0, 1))
+
+    result = dlib.sparse_vectors()
+    for token in sentence.split():
+        chosen = capitalized if token[0].isupper() else lowercase
+        result.append(chosen)
+    return result
+
+
+def print_segment(sentence, names):
+    # Writes each range in `names` on its own line as the words it indexes,
+    # with a single space after every word (including the last one).
+    tokens = sentence.split()
+    for span in names:
+        sys.stdout.write("".join(tokens[k] + " " for k in span) + "\n")
+
+
+
+# Now let's make some training data.  Each example is a sentence together with
+# a set of ranges giving the word positions of any names it contains.
+names = dlib.ranges()     # scratch array of dlib.range objects, reused below.
+segments = dlib.rangess() # array of arrays of dlib.range objects.
+sentences = []            # raw text; sentences[i] is labeled by segments[i].
+
+sentences.append("The other day I saw a man named Jim Smith")
+# We want to detect person names, so we record that the name occupies the
+# word range [8, 10).  Ranges are half open: the begin index is included and
+# the end index is not, so this segment covers exactly "Jim Smith".
+names.append(dlib.range(8, 10))
+segments.append(names)
+names.clear() # empty names for reuse; relies on append() above storing a copy
+
+sentences.append("Davis King is the main author of the dlib Library")
+names.append(dlib.range(0, 2))  # "Davis King"
+segments.append(names)
+names.clear()
+
+sentences.append("Bob Jones is a name and so is George Clinton")
+names.append(dlib.range(0, 2))  # "Bob Jones"
+names.append(dlib.range(8, 10))  # "George Clinton"
+segments.append(names)
+names.clear()
+
+sentences.append("My dog is named Bob Barker")
+names.append(dlib.range(4, 6))  # "Bob Barker"
+segments.append(names)
+names.clear()
+
+sentences.append("ABC is an acronym but John James Smith is a name")
+names.append(dlib.range(5, 8))  # "John James Smith"
+segments.append(names)
+names.clear()
+
+sentences.append("No names in this sentence at all")
+segments.append(names)  # names is empty here: this sentence has no names
+names.clear()
+
+
+# Now before we can pass these training sentences to the dlib tools we need to
+# convert them into arrays of vectors as discussed above.  We can use either a
+# sparse or dense representation depending on our needs.  In this example, we
+# show how to do it both ways.
+use_sparse_vects = False
+# training_sequences is an array of arrays of vectors: dlib.sparse_vectorss
+# holds dlib.sparse_vector objects while dlib.vectorss holds dlib.vector
+# objects.  Each sentence is converted with the function for that form.
+training_sequences = (dlib.sparse_vectorss() if use_sparse_vects
+                      else dlib.vectorss())
+for s in sentences:
+    if use_sparse_vects:
+        training_sequences.append(sentence_to_sparse_vectors(s))
+    else:
+        training_sequences.append(sentence_to_vectors(s))
+
+# Now that we have a simple training set we can train a sequence segmenter.
+# However, the sequence segmentation trainer has some optional parameters we can
+# set.  These parameters determine properties of the segmentation model we will
+# learn.  See the dlib documentation for the sequence_segmenter object for a
+# full discussion of their meanings.
+params = dlib.segmenter_params()
+params.window_size = 3  # presumably words of context per position; see docs
+params.use_high_order_features = True
+params.use_BIO_model = True  # presumably BIO (vs. BILOU) labeling; see docs
+# This is the usual SVM C parameter.  Larger values encourage the trainer to
+# fit the training data exactly but might overfit.  In general you pick it by
+# cross-validation (a cross-validation call closes this example).
+params.C = 10
+
+# Train a model on the vectorized sentences and their labeled name ranges.
+# The resulting model predicts the locations of names in new sentences.
+model = dlib.train_sequence_segmenter(training_sequences, segments, params)
+
+# Let's print out the things the model thinks are names.  Calling the model on
+# a sequence of feature vectors returns the ranges predicted to contain names.
+# If you run this example program you will see that it gets them all correct.
+for i, s in enumerate(sentences):
+    print_segment(s, model(training_sequences[i]))
+
+# Let's also try segmenting a new sentence.  This will print out "Bob Bucket".
+# Note that the sentence must be converted with the same vector representation
+# (sparse or dense) that was used during training.
+test_sentence = "There once was a man from Nantucket " \
+                "whose name rhymed with Bob Bucket"
+if use_sparse_vects:
+    print_segment(test_sentence,
+                  model(sentence_to_sparse_vectors(test_sentence)))
+else:
+    print_segment(test_sentence, model(sentence_to_vectors(test_sentence)))
+
+# We can also measure the accuracy of a model relative to some labeled data.
+# Here that data is the training set itself (training_sequences/segments), so
+# this prints the precision, recall, and F1-score on already-seen examples.
+print("Test on training data: {}".format(
+      dlib.test_sequence_segmenter(model, training_sequences, segments)))
+
+# We can also do 5-fold cross-validation and print the resulting precision,
+# recall, and F1-score.  The 5 passed below is the number of folds.
+print("Cross validation: {}".format(
+      dlib.cross_validate_sequence_segmenter(training_sequences, segments, 5,
+                                             params)))