| | 1 | r""" |
|---|
| | 2 | Hidden Markov Models |
|---|
| | 3 | |
|---|
| | 4 | AUTHOR: William Stein |
|---|
| | 5 | |
|---|
| | 6 | EXAMPLES: |
|---|
| | 7 | |
|---|
| | 8 | """ |
|---|
| | 9 | |
|---|
| | 10 | import math |
|---|
| | 11 | |
|---|
| | 12 | from sage.matrix.all import is_Matrix, matrix |
|---|
| | 13 | from sage.rings.all import RDF |
|---|
| | 14 | |
|---|
| | 15 | from sage.matrix.matrix_real_double_dense cimport Matrix_real_double_dense |
|---|
| | 16 | |
|---|
| | 17 | include "../../ext/stdsage.pxi" |
|---|
| | 18 | |
|---|
| | 19 | cdef extern from "ghmm/ghmm.h": |
|---|
| | 20 | cdef int GHMM_kSilentStates |
|---|
| | 21 | cdef int GHMM_kHigherOrderEmissions |
|---|
| | 22 | cdef int GHMM_kDiscreteHMM |
|---|
| | 23 | |
|---|
| | 24 | cdef extern from "ghmm/model.h": |
|---|
| | 25 | ctypedef struct ghmm_dstate: |
|---|
| | 26 | # Initial probability |
|---|
| | 27 | double pi |
|---|
| | 28 | # Output probability |
|---|
| | 29 | double *b |
|---|
| | 30 | |
|---|
| | 31 | # ID's of the following states |
|---|
| | 32 | int *out_id |
|---|
| | 33 | # ID's of the previous states |
|---|
| | 34 | int *in_id |
|---|
| | 35 | |
|---|
| | 36 | # Transition probabilities to successor states. |
|---|
| | 37 | double *out_a |
|---|
| | 38 | # Transition probabilities from predecessor states. |
|---|
| | 39 | double *in_a |
|---|
| | 40 | |
|---|
| | 41 | # Number of successor states |
|---|
| | 42 | int out_states |
|---|
| | 43 | # Number of precursor states |
|---|
| | 44 | int in_states |
|---|
| | 45 | # if fix == 1 then output probabilities b stay fixed during the training |
|---|
| | 46 | int fix |
|---|
| | 47 | # Contains a description of the state (null terminated utf-8) |
|---|
| | 48 | char * desc |
|---|
| | 49 | # x coordinate position for graph representation plotting |
|---|
| | 50 | int xPosition |
|---|
| | 51 | # y coordinate position for graph representation plotting |
|---|
| | 52 | int yPosition |
|---|
| | 53 | |
|---|
| | 54 | ctypedef struct ghmm_dbackground: |
|---|
| | 55 | pass |
|---|
| | 56 | |
|---|
| | 57 | ctypedef struct ghmm_alphabet: |
|---|
| | 58 | pass |
|---|
| | 59 | |
|---|
| | 60 | ctypedef struct ghmm_dmodel: |
|---|
| | 61 | # Number of states |
|---|
| | 62 | int N |
|---|
| | 63 | # Number of outputs |
|---|
| | 64 | int M |
|---|
| | 65 | # Vector of states |
|---|
| | 66 | ghmm_dstate *s |
|---|
| | 67 | # Contains bit flags for varios model extensions such as |
|---|
| | 68 | # kSilentStates, kTiedEmissions (see ghmm.h for a complete list) |
|---|
| | 69 | int model_type |
|---|
| | 70 | # The a priori probability for the model. |
|---|
| | 71 | # A value of -1 indicates that no prior is defined. |
|---|
| | 72 | # Note: this is not to be confused with priors on emission distributions. |
|---|
| | 73 | double prior |
|---|
| | 74 | # An arbitrary name for the model (null terminated utf-8) |
|---|
| | 75 | char * name |
|---|
| | 76 | |
|---|
| | 77 | # Flag variables for each state indicating whether it is emitting or not. |
|---|
| | 78 | # Note: silent != NULL iff (model_type & kSilentStates) == 1 |
|---|
| | 79 | int *silent |
|---|
| | 80 | |
|---|
| | 81 | # Int variable for the maximum level of higher order emissions. |
|---|
| | 82 | int maxorder |
|---|
| | 83 | |
|---|
| | 84 | # saves the history of emissions as int, |
|---|
| | 85 | # the nth-last emission is (emission_history * |alphabet|^n+1) % |alphabet| |
|---|
| | 86 | int emission_history |
|---|
| | 87 | |
|---|
| | 88 | # Flag variables for each state indicating whether the states emissions |
|---|
| | 89 | # are tied to another state. Groups of tied states are represented |
|---|
| | 90 | # by their tie group leader (the lowest numbered member of the group). |
|---|
| | 91 | # tied_to[s] == kUntied : s is not a tied state |
|---|
| | 92 | # tied_to[s] == s : s is a tie group leader |
|---|
| | 93 | # tied_to[t] == s : t is tied to state s (t>s) |
|---|
| | 94 | # Note: tied_to != NULL iff (model_type & kTiedEmissions) != 0 */ |
|---|
| | 95 | int *tied_to |
|---|
| | 96 | |
|---|
| | 97 | # Note: State store order information of the emissions. |
|---|
| | 98 | # Classical HMMS have emission order 0; that is, the emission probability |
|---|
| | 99 | # is conditioned only on the state emitting the symbol. |
|---|
| | 100 | # For higher order emissions, the emission are conditioned on the state s |
|---|
| | 101 | # as well as the previous emission_order observed symbols. |
|---|
| | 102 | # The emissions are stored in the state's usual double* b. |
|---|
| | 103 | # Note: order != NULL iff (model_type & kHigherOrderEmissions) != 0 */ |
|---|
| | 104 | int * order |
|---|
| | 105 | |
|---|
| | 106 | # ghmm_dbackground is a pointer to a |
|---|
| | 107 | # ghmm_dbackground structure, which holds (essentially) an |
|---|
| | 108 | # array of background distributions (which are just vectors of floating |
|---|
| | 109 | # point numbers like state.b). |
|---|
| | 110 | # For each state the array background_id indicates which of the background |
|---|
| | 111 | # distributions to use in parameter estimation. A value of kNoBackgroundDistribution |
|---|
| | 112 | # indicates that none should be used. |
|---|
| | 113 | # Note: background_id != NULL iff (model_type & kHasBackgroundDistributions) != 0 |
|---|
| | 114 | int *background_id |
|---|
| | 115 | ghmm_dbackground *bp |
|---|
| | 116 | |
|---|
| | 117 | # Topological ordering of silent states |
|---|
| | 118 | # Condition: topo_order != NULL iff (model_type & kSilentStates) != 0 |
|---|
| | 119 | int *topo_order |
|---|
| | 120 | int topo_order_length |
|---|
| | 121 | |
|---|
| | 122 | # pow_lookup is a array of precomputed powers |
|---|
| | 123 | # It contains in the i-th entry M (alphabet size) to the power of i |
|---|
| | 124 | # The last entry is maxorder+1. |
|---|
| | 125 | int *pow_lookup |
|---|
| | 126 | |
|---|
| | 127 | # Store for each state a class label. Limits the possibly state sequence |
|---|
| | 128 | # Note: label != NULL iff (model_type & kLabeledStates) != 0 |
|---|
| | 129 | int* label |
|---|
| | 130 | ghmm_alphabet* label_alphabet |
|---|
| | 131 | ghmm_alphabet* alphabet |
|---|
| | 132 | |
|---|
| | 133 | |
|---|
| | 134 | int ghmm_dmodel_normalize(ghmm_dmodel *m) |
|---|
| | 135 | int ghmm_dmodel_free(ghmm_dmodel **m) |
|---|
| | 136 | |
|---|
| | 137 | |
|---|
| | 138 | cdef class HiddenMarkovModel: |
|---|
| | 139 | pass |
|---|
| | 140 | |
|---|
| | 141 | |
|---|
| | 142 | cdef class DiscreteHiddenMarkovModel: |
|---|
| | 143 | cdef ghmm_dmodel* m |
|---|
| | 144 | cdef bint initialized |
|---|
| | 145 | cdef object emission_symbols |
|---|
| | 146 | cdef Matrix_real_double_dense A, B |
|---|
| | 147 | |
|---|
| | 148 | def __init__(self, A, B, pi, emission_symbols=None, name=None): |
|---|
| | 149 | """ |
|---|
| | 150 | INPUTS: |
|---|
| | 151 | A -- square matrix of doubles; the state change probabilities |
|---|
| | 152 | B -- matrix of doubles; emission probabilities |
|---|
| | 153 | pi -- list of doubles; probabilities for each initial state |
|---|
| | 154 | emission_state -- list of B.ncols() symbols (just used for printing) |
|---|
| | 155 | name -- (optional) name of the model |
|---|
| | 156 | |
|---|
| | 157 | EXAMPLES: |
|---|
| | 158 | We create a discrete HMM with 2 internal states on an alphabet of |
|---|
| | 159 | size 2. |
|---|
| | 160 | |
|---|
| | 161 | sage: a = hmm.DiscreteHiddenMarkovModel([[0.2,0.8],[0.5,0.5]], [[1,0],[0,1]], [0,1]) |
|---|
| | 162 | |
|---|
| | 163 | """ |
|---|
| | 164 | if not is_Matrix(A): |
|---|
| | 165 | A = matrix(RDF, len(A), len(A[0]), A) |
|---|
| | 166 | if not is_Matrix(B): |
|---|
| | 167 | B = matrix(RDF, len(B), len(B[0]), B) |
|---|
| | 168 | if not A.is_square(): |
|---|
| | 169 | raise ValueError, "A must be square" |
|---|
| | 170 | if A.nrows() != B.nrows(): |
|---|
| | 171 | raise ValueError, "number of rows of A and B must be the same" |
|---|
| | 172 | if A.base_ring() != RDF: |
|---|
| | 173 | A = A.change_ring(RDF) |
|---|
| | 174 | if B.base_ring() != RDF: |
|---|
| | 175 | B = B.change_ring(RDF) |
|---|
| | 176 | if len(pi) != A.nrows(): |
|---|
| | 177 | raise ValueError, "length of pi must equal number of rows of A" |
|---|
| | 178 | |
|---|
| | 179 | cdef Py_ssize_t i, j, k |
|---|
| | 180 | |
|---|
| | 181 | if emission_symbols is None: |
|---|
| | 182 | self.emission_symbols = range(B.ncols()) |
|---|
| | 183 | else: |
|---|
| | 184 | self.emission_symbols = list(emission_symbols) |
|---|
| | 185 | |
|---|
| | 186 | self.m = <ghmm_dmodel*> sage_malloc(sizeof(ghmm_dmodel)) |
|---|
| | 187 | if not self.m: raise MemoryError |
|---|
| | 188 | |
|---|
| | 189 | # Set all pointers to NULL |
|---|
| | 190 | self.m.s = NULL; self.m.name = NULL; self.m.silent = NULL |
|---|
| | 191 | self.m.tied_to = NULL; self.m.order = NULL; self.m.background_id = NULL |
|---|
| | 192 | self.m.bp = NULL; self.m.topo_order = NULL; self.m.pow_lookup = NULL; |
|---|
| | 193 | self.m.label = NULL; self.m.label_alphabet = NULL; self.m.alphabet = NULL |
|---|
| | 194 | |
|---|
| | 195 | # Set number of states and number of outputs |
|---|
| | 196 | self.m.N = A.nrows() |
|---|
| | 197 | self.m.M = len(self.emission_symbols) |
|---|
| | 198 | # Set the model type to discrete |
|---|
| | 199 | self.m.model_type = GHMM_kDiscreteHMM |
|---|
| | 200 | |
|---|
| | 201 | self.A = A |
|---|
| | 202 | self.B = B |
|---|
| | 203 | |
|---|
| | 204 | # Set that no a prior model probabilities are set. |
|---|
| | 205 | self.m.prior = -1 |
|---|
| | 206 | # Assign model identifier if specified |
|---|
| | 207 | if name is not None: |
|---|
| | 208 | name = str(name) |
|---|
| | 209 | self.m.name = name |
|---|
| | 210 | else: |
|---|
| | 211 | self.m.name = NULL |
|---|
| | 212 | |
|---|
| | 213 | # Allocate and initialize states |
|---|
| | 214 | cdef ghmm_dstate* states = <ghmm_dstate*> safe_malloc(sizeof(ghmm_dstate) * self.m.N) |
|---|
| | 215 | cdef ghmm_dstate* state |
|---|
| | 216 | |
|---|
| | 217 | silent_states = [] |
|---|
| | 218 | tmp_order = [] |
|---|
| | 219 | |
|---|
| | 220 | for i in range(self.m.N): |
|---|
| | 221 | v = B[i] |
|---|
| | 222 | |
|---|
| | 223 | # Get a reference to the i-th state for convenience of the notation below. |
|---|
| | 224 | state = &(states[i]) |
|---|
| | 225 | state.desc = NULL |
|---|
| | 226 | |
|---|
| | 227 | # Compute state order |
|---|
| | 228 | if self.m.M > 1: |
|---|
| | 229 | order = math.log( len(v), self.m.M ) - 1 |
|---|
| | 230 | else: |
|---|
| | 231 | order = len(v) - 1 |
|---|
| | 232 | |
|---|
| | 233 | # Check for valid number of emission parameters |
|---|
| | 234 | order = int(order) |
|---|
| | 235 | if self.m.M**(order+1) == len(v): |
|---|
| | 236 | tmp_order.append(order) |
|---|
| | 237 | else: |
|---|
| | 238 | raise ValueError, "number of columns (= %s) of B must be a power of the number of emission symbols (= %s)"%( |
|---|
| | 239 | B.ncols(), len(emission_symbols)) |
|---|
| | 240 | |
|---|
| | 241 | state.b = to_double_array(v) |
|---|
| | 242 | state.pi = pi[i] |
|---|
| | 243 | |
|---|
| | 244 | silent_states.append( 1 if sum(v) == 0 else 0) |
|---|
| | 245 | |
|---|
| | 246 | # Set out probabilities, i.e., the probabilities that each |
|---|
| | 247 | # symbol will be emitted from this state. |
|---|
| | 248 | v = A[i] |
|---|
| | 249 | nz = v.nonzero_positions() |
|---|
| | 250 | k = len(nz) |
|---|
| | 251 | state.out_states = k |
|---|
| | 252 | state.out_id = <int*> safe_malloc(sizeof(int)*k) |
|---|
| | 253 | state.out_a = <double*> safe_malloc(sizeof(double)*k) |
|---|
| | 254 | for j in range(k): |
|---|
| | 255 | state.out_id[j] = nz[j] |
|---|
| | 256 | state.out_a[j] = v[nz[j]] |
|---|
| | 257 | |
|---|
| | 258 | # Set "in" probabilities |
|---|
| | 259 | v = A.column(i) |
|---|
| | 260 | nz = v.nonzero_positions() |
|---|
| | 261 | k = len(nz) |
|---|
| | 262 | state.in_states = k |
|---|
| | 263 | state.in_id = <int*> safe_malloc(sizeof(int)*k) |
|---|
| | 264 | state.in_a = <double*> safe_malloc(sizeof(double)*k) |
|---|
| | 265 | for j in range(k): |
|---|
| | 266 | state.in_id[j] = nz[j] |
|---|
| | 267 | state.in_a[j] = v[nz[j]] |
|---|
| | 268 | |
|---|
| | 269 | state.fix = 0 |
|---|
| | 270 | |
|---|
| | 271 | self.m.s = states |
|---|
| | 272 | self.initialized = True; return |
|---|
| | 273 | |
|---|
| | 274 | if sum(silent_states) > 0: |
|---|
| | 275 | self.m.model_type |= GHMM_kSilentStates |
|---|
| | 276 | self.m.silent = to_int_array(silent_states) |
|---|
| | 277 | |
|---|
| | 278 | self.m.maxorder = max(tmp_order) |
|---|
| | 279 | if self.m.maxorder > 0: |
|---|
| | 280 | self.m.model_type |= GHMM_kHigherOrderEmissions |
|---|
| | 281 | self.m.order = to_int_array(tmp_order) |
|---|
| | 282 | |
|---|
| | 283 | # Initialize lookup table for powers of the alphabet size, |
|---|
| | 284 | # which speeds up models with higher order states. |
|---|
| | 285 | powLookUp = [1] * (self.m.maxorder+2) |
|---|
| | 286 | for i in range(1,len(powLookUp)): |
|---|
| | 287 | powLookUp[i] = powLookUp[i-1] * self.m.M |
|---|
| | 288 | self.m.pow_lookup = to_int_array(powLookUp) |
|---|
| | 289 | |
|---|
| | 290 | self.initialized = True |
|---|
| | 291 | |
|---|
| | 292 | |
|---|
| | 293 | def __repr__(self): |
|---|
| | 294 | s = "Discrete Hidden Markov Model%s (%s states, %s outputs)\n"%( |
|---|
| | 295 | ' ' + self.m.name if self.m.name else '', |
|---|
| | 296 | self.m.N, self.m.M) |
|---|
| | 297 | s += 'Initial probabilities: %s\n'%self.initial_probabilities() |
|---|
| | 298 | s += 'Transition matrix:\n%s\n'%self.transition_matrix() |
|---|
| | 299 | s += 'Emission matrix:\n%s\n'%self.emission_matrix() |
|---|
| | 300 | return s |
|---|
| | 301 | |
|---|
| | 302 | def initial_probabilities(self): |
|---|
| | 303 | cdef Py_ssize_t i |
|---|
| | 304 | return [self.m.s[i].pi for i in range(self.m.N)] |
|---|
| | 305 | |
|---|
| | 306 | def transition_matrix(self, list_only=True): |
|---|
| | 307 | cdef Py_ssize_t i, j |
|---|
| | 308 | for i from 0 <= i < self.m.N: |
|---|
| | 309 | for j from 0 <= j < self.m.s[i].out_states: |
|---|
| | 310 | self.A.set_unsafe_double(i,j,self.m.s[i].out_a[j]) |
|---|
| | 311 | return self.A |
|---|
| | 312 | |
|---|
| | 313 | def emission_matrix(self, list_only=True): |
|---|
| | 314 | cdef Py_ssize_t i, j |
|---|
| | 315 | for i from 0 <= i < self.m.N: |
|---|
| | 316 | for j from 0 <= j < self.B._ncols: |
|---|
| | 317 | self.B.set_unsafe_double(i,j,self.m.s[i].b[j]) |
|---|
| | 318 | return self.B |
|---|
| | 319 | |
|---|
| | 320 | def normalize(self): |
|---|
| | 321 | """ |
|---|
| | 322 | Normalize the transition and emission probabilities, if applicable. |
|---|
| | 323 | """ |
|---|
| | 324 | ghmm_dmodel_normalize(self.m) |
|---|
| | 325 | |
|---|
| | 326 | def __dealloc__(self): |
|---|
| | 327 | if self.initialized: |
|---|
| | 328 | ghmm_dmodel_free(&self.m) |
|---|
| | 329 | |
|---|
| | 330 | |
|---|
| | 331 | |
|---|
| | 332 | |
|---|
| | 333 | ################################################################################## |
|---|
| | 334 | # Helper Functions |
|---|
| | 335 | ################################################################################## |
|---|
| | 336 | |
|---|
| | 337 | cdef double* to_double_array(v) except NULL: |
|---|
| | 338 | cdef double x |
|---|
| | 339 | cdef double* w = <double*> safe_malloc(sizeof(double)*len(v)) |
|---|
| | 340 | cdef Py_ssize_t i = 0 |
|---|
| | 341 | for x in v: |
|---|
| | 342 | w[i] = x |
|---|
| | 343 | i += 1 |
|---|
| | 344 | return w |
|---|
| | 345 | |
|---|
| | 346 | cdef int* to_int_array(v) except NULL: |
|---|
| | 347 | cdef int x |
|---|
| | 348 | cdef int* w = <int*> safe_malloc(sizeof(int)*len(v)) |
|---|
| | 349 | cdef Py_ssize_t i = 0 |
|---|
| | 350 | for x in v: |
|---|
| | 351 | w[i] = x |
|---|
| | 352 | i += 1 |
|---|
| | 353 | return w |
|---|
| | 354 | |
|---|
| | 355 | cdef void* safe_malloc(int bytes) except NULL: |
|---|
| | 356 | """ |
|---|
| | 357 | malloc the given bytes of memory and check that the malloc |
|---|
| | 358 | succeeds -- if not raise a MemoryError. |
|---|
| | 359 | """ |
|---|
| | 360 | cdef void* t = sage_malloc(bytes) |
|---|
| | 361 | if not t: |
|---|
| | 362 | raise MemoryError, "error allocating memory for Hidden Markov Model" |
|---|
| | 363 | return t |
|---|
| | 364 | |