summaryrefslogtreecommitdiffstats
path: root/ml/dlib/python_examples/sequence_segmenter.py
diff options
context:
space:
mode:
Diffstat (limited to 'ml/dlib/python_examples/sequence_segmenter.py')
-rwxr-xr-xml/dlib/python_examples/sequence_segmenter.py197
1 files changed, 0 insertions, 197 deletions
diff --git a/ml/dlib/python_examples/sequence_segmenter.py b/ml/dlib/python_examples/sequence_segmenter.py
deleted file mode 100755
index 335e475f7..000000000
--- a/ml/dlib/python_examples/sequence_segmenter.py
+++ /dev/null
@@ -1,197 +0,0 @@
-#!/usr/bin/python
-# The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
-#
-# This example shows how to use dlib to learn to do sequence segmentation. In
-# a sequence segmentation task we are given a sequence of objects (e.g. words in
-# a sentence) and we are supposed to detect certain subsequences (e.g. the names
-# of people). Therefore, in the code below we create some very simple training
-# sequences and use them to learn a sequence segmentation model. In particular,
-# our sequences will be sentences represented as arrays of words and our task
-# will be to learn to identify person names. Once we have our segmentation
-# model we can use it to find names in new sentences, as we will show.
-#
-# COMPILING/INSTALLING THE DLIB PYTHON INTERFACE
-# You can install dlib using the command:
-# pip install dlib
-#
-# Alternatively, if you want to compile dlib yourself then go into the dlib
-# root folder and run:
-# python setup.py install
-# or
-# python setup.py install --yes USE_AVX_INSTRUCTIONS
-# if you have a CPU that supports AVX instructions, since this makes some
-# things run faster.
-#
-# Compiling dlib should work on any operating system so long as you have
-# CMake installed. On Ubuntu, this can be done easily by running the
-# command:
-# sudo apt-get install cmake
-#
-import sys
-import dlib
-
-
-# The sequence segmentation models we work with in this example are chain
-# structured conditional random field style models. Therefore, central to a
-# sequence segmentation model is some method for converting the elements of a
-# sequence into feature vectors. That is, while you might start out representing
-# your sequence as an array of strings, the dlib interface works in terms of
-# arrays of feature vectors. Each feature vector should capture important
-# information about its corresponding element in the original raw sequence. So
-# in this example, since we work with sequences of words and want to identify
-# names, we will create feature vectors that tell us if the word is capitalized
-# or not. In our simple data, this will be enough to identify names.
-# Therefore, we define sentence_to_vectors() which takes a sentence represented
-# as a string and converts it into an array of words and then associates a
-# feature vector with each word.
def sentence_to_vectors(sentence):
    """Turn a sentence into a dlib.vectors array with one entry per word.

    The sentence is split on whitespace.  Every word contributes a
    1-dimensional dlib.vector whose only value is 1 when the word's first
    character is uppercase and 0 otherwise.
    """
    features = dlib.vectors()
    for token in sentence.split():
        # The capitalization flag is the only feature this example uses.
        capitalized = 1 if token[0].isupper() else 0
        features.append(dlib.vector([capitalized]))
    return features
-
-
-# Dlib also supports the use of a sparse vector representation. This is more
-# efficient than the above form when you have very high dimensional vectors that
-# are mostly full of zeros. In dlib, each sparse vector is represented as an
-# array of pair objects. Each pair contains an index and value. Any index not
-# listed in the vector is implicitly associated with a value of zero.
-# Additionally, when using sparse vectors with dlib.train_sequence_segmenter()
-# you can use "unsorted" sparse vectors. This means you can add the index/value
-# pairs into your sparse vectors in any order you want and don't need to worry
-# about them being in sorted order.
def sentence_to_sparse_vectors(sentence):
    """Turn a sentence into a dlib.sparse_vectors array with one entry per word.

    This is the sparse counterpart of the dense capitalization feature: a
    word whose first character is uppercase gets a sparse vector holding the
    single pair (index 0, value 1); any other word gets an empty sparse
    vector.
    """
    # Left empty on purpose: an index that is not listed is implicitly zero,
    # so this is the sparse form of dlib.vector([0]).
    lowercase_vec = dlib.sparse_vector()

    # The sparse form of dlib.vector([1]).
    capitalized_vec = dlib.sparse_vector()
    capitalized_vec.append(dlib.pair(0, 1))

    result = dlib.sparse_vectors()
    for token in sentence.split():
        result.append(capitalized_vec if token[0].isupper() else lowercase_vec)
    return result
-
-
def print_segment(sentence, names):
    """Write the words of each detected segment to stdout, one segment per line.

    `sentence` is split on whitespace, and `names` is an iterable of
    segments, each of which iterates over word indices into that split.
    Every selected word is written followed by a single space, and each
    segment is terminated by a newline.
    """
    tokens = sentence.split()
    for segment in names:
        for idx in segment:
            sys.stdout.write("{} ".format(tokens[idx]))
        sys.stdout.write("\n")
-
-
-
-# Now let's make some training data. Each example is a sentence as well as a
-# set of ranges which indicate the locations of any names.
# Each entry pairs a sentence with the half open word ranges [begin, end) that
# cover the person names in it.  For example, (8, 10) in the first sentence
# identifies the string "Jim Smith".  The last sentence contains no names.
_labeled_examples = [
    ("The other day I saw a man named Jim Smith", [(8, 10)]),
    ("Davis King is the main author of the dlib Library", [(0, 2)]),
    ("Bob Jones is a name and so is George Clinton", [(0, 2), (8, 10)]),
    ("My dog is named Bob Barker", [(4, 6)]),
    ("ABC is an acronym but John James Smith is a name", [(5, 8)]),
    ("No names in this sentence at all", []),
]

names = dlib.ranges()      # scratch array of dlib.range objects, reused per sentence
segments = dlib.rangess()  # array of arrays of dlib.range objects, one per sentence
sentences = []

for _text, _spans in _labeled_examples:
    sentences.append(_text)
    for _begin, _end in _spans:
        names.append(dlib.range(_begin, _end))
    segments.append(names)
    names.clear()  # make names empty for use with the next sentence
-
-
-# Now before we can pass these training sentences to the dlib tools we need to
-# convert them into arrays of vectors as discussed above. We can use either a
-# sparse or dense representation depending on our needs. In this example, we
-# show how to do it both ways.
use_sparse_vects = False

# Choose the container type and the matching sentence converter together so
# the two representations can never be mixed.
if use_sparse_vects:
    # An array of arrays of dlib.sparse_vector objects.
    training_sequences = dlib.sparse_vectorss()
    _to_feature_vectors = sentence_to_sparse_vectors
else:
    # An array of arrays of dlib.vector objects.
    training_sequences = dlib.vectorss()
    _to_feature_vectors = sentence_to_vectors

for s in sentences:
    training_sequences.append(_to_feature_vectors(s))
-
-# Now that we have a simple training set we can train a sequence segmenter.
-# However, the sequence segmentation trainer has some optional parameters we can
-# set. These parameters determine properties of the segmentation model we will
-# learn. See the dlib documentation for the sequence_segmenter object for a
-# full discussion of their meanings.
params = dlib.segmenter_params()
# C is the usual SVM regularization parameter.  A larger value pushes the
# trainer toward fitting the training data exactly, at the risk of
# overfitting; cross-validation is the usual way to pick it.
params.C = 10
# The remaining options shape the segmentation model itself; the dlib
# documentation for the sequence_segmenter object explains each of them.
params.use_BIO_model = True
params.use_high_order_features = True
params.window_size = 3

# Fit the segmenter.  The resulting model object is what predicts where the
# names are located in a new sentence.
model = dlib.train_sequence_segmenter(training_sequences, segments, params)
-
-# Let's print out the things the model thinks are names. The output is a set
-# of ranges which are predicted to contain names. If you run this example
-# program you will see that it gets them all correct.
for s, sequence in zip(sentences, training_sequences):
    print_segment(s, model(sequence))

# Segment a sentence the model has not seen; this prints "Bob Bucket".  The
# sentence has to be converted with the same vector representation that was
# used during training.
test_sentence = ("There once was a man from Nantucket "
                 "whose name rhymed with Bob Bucket")
_vectorize = sentence_to_sparse_vectors if use_sparse_vects else sentence_to_vectors
print_segment(test_sentence, model(_vectorize(test_sentence)))
-
-# We can also measure the accuracy of a model relative to some labeled data.
-# This statement prints the precision, recall, and F1-score of the model
-# relative to the data in training_sequences/segments.
training_metrics = dlib.test_sequence_segmenter(
    model, training_sequences, segments)
print("Test on training data: {}".format(training_metrics))

# Repeat the measurement with 5-fold cross-validation, which again reports
# precision, recall, and F1-score.
cross_validation_metrics = dlib.cross_validate_sequence_segmenter(
    training_sequences, segments, 5, params)
print("Cross validation: {}".format(cross_validation_metrics))