Ticket #3726: sage-3726-part3.patch
| File sage-3726-part3.patch, 10.7 kB (added by was, 5 months ago) |
|---|
-
/dev/null
old new 1 # Hidden Markov Models -
a/sage/stats/hmm/hmm.pyx
old new 133 133 ghmm_alphabet* alphabet 134 134 135 135 136 int ghmm_dmodel_normalize(ghmm_dmodel *m)137 int ghmm_dmodel_free(ghmm_dmodel **m)138 int ghmm_dmodel_logp(ghmm_dmodel *m, int *O, int len, double *log_p)139 140 141 136 # Discrete sequences 142 137 ctypedef struct ghmm_dseq: 143 138 # sequence array. sequence[i] [j] = j-th symbol of i-th seq … … 171 166 ghmm_dseq *ghmm_dmodel_generate_sequences(ghmm_dmodel* mo, int seed, int global_len, 172 167 long seq_number, int Tmax) 173 168 169 170 int ghmm_dmodel_normalize(ghmm_dmodel *m) 171 int ghmm_dmodel_free(ghmm_dmodel **m) 172 int ghmm_dmodel_logp(ghmm_dmodel *m, int *O, int len, double *log_p) 173 int *ghmm_dmodel_viterbi (ghmm_dmodel *m, int *O, int len, int *pathlen, double *log_p) 174 int ghmm_dmodel_baum_welch (ghmm_dmodel *m, ghmm_dseq *sq) 175 int ghmm_dmodel_baum_welch_nstep (ghmm_dmodel * m, ghmm_dseq *sq, int max_step, 176 double likelihood_delta) 177 178 174 179 cdef class HiddenMarkovModel: 175 180 pass 176 181 … … 178 183 cdef class DiscreteHiddenMarkovModel: 179 184 cdef ghmm_dmodel* m 180 185 cdef bint initialized 181 cdef object _emission_symbols 182 cdef object _emission_symbols_special 186 cdef object _emission_symbols, _emission_symbols_dict 183 187 cdef Matrix_real_double_dense A, B 184 188 185 189 def __init__(self, A, B, pi, emission_symbols=None, name=None): … … 328 332 ghmm_dmodel_free(&self.m) 329 333 330 334 def __repr__(self): 335 """ 336 Return string representation of this HMM. 337 338 OUTPUT: 339 string 340 341 EXAMPLES: 342 sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[0.9,0.1],[0.1,0.9]], [0.5,0.5], [3/4, 'abc']) 343 sage: a.__repr__() 344 "Discrete Hidden Markov Model (2 states, 2 outputs)\n\nInitial probabilities: [0.5, 0.5]\nTransition matrix:\n[0.1 0.9]\n[0.1 0.9]\nEmission matrix:\n[0.9 0.1]\n[0.1 0.9]\nEmission symbols: [3/4, 'abc']" 345 """ 331 346 s = "Discrete Hidden Markov Model%s (%s states, %s outputs)\n"%( 332 347 ' ' + self.m.name if self.m.name else '', 333 348 self.m.N, self.m.M) 334 s += 'Initial probabilities: %s\n'%self.initial_probabilities() 335 s += 'Transition matrix:\n%s\n'%self.transition_matrix() 336 s += 'Emission matrix:\n%s\n'%self.emission_matrix() 349 s += '\nInitial probabilities: %s'%self.initial_probabilities() 350 s += '\nTransition matrix:\n%s'%self.transition_matrix() 351 s += '\nEmission matrix:\n%s'%self.emission_matrix() 352 if self._emission_symbols_dict: 353 s += '\nEmission symbols: %s'%self._emission_symbols 337 354 return s 338 355 339 356 def initial_probabilities(self): … … 422 439 cdef ghmm_dseq *d = ghmm_dmodel_generate_sequences(self.m, seed, length, 1, length) 423 440 cdef Py_ssize_t i 424 441 v = [d.seq[0][i] for i in range(length)] 425 if self._emission_symbols_special: 426 return v 427 else: 442 if self._emission_symbols_dict: 428 443 w = self._emission_symbols 429 444 return [w[i] for i in v] 445 else: 446 return v 430 447 431 448 def sample(self, long length, long number): 432 449 """ … … 442 459 cdef ghmm_dseq *d = ghmm_dmodel_generate_sequences(self.m, seed, length, number, length) 443 460 cdef Py_ssize_t i, j 444 461 v = [[d.seq[j][i] for i in range(length)] for j in range(number)] 445 if self._emission_symbols_special: 446 return v 447 else: 462 if self._emission_symbols_dict: 448 463 w = self._emission_symbols 449 464 return [[w[i] for i in z] for z in v] 465 else: 466 return v 450 467 451 468 def emission_symbols(self): 452 469 """ … … 479 496 """ 480 497 if emission_symbols is None: 481 498 self._emission_symbols = range(self.B.ncols()) 499 self._emission_symbols_dict = None 482 500 else: 483 501 self._emission_symbols = list(emission_symbols) 484 self._emission_symbols_special = (self._emission_symbols == range(self.B.ncols())) 502 if self._emission_symbols != range(self.B.ncols()): 503 self._emission_symbols_dict = dict([(x,i) for i, x in enumerate(emission_symbols)]) 485 504 486 cpdef double log_likelihood(self, seq): 505 506 #################################################################### 507 # HMM Problem 1 -- Computing likelihood: Given the parameter set 508 # lambda of an HMM model and an observation sequence O, determine 509 # the likelihood P(O | lambda). 510 #################################################################### 511 def log_likelihood(self, seq): 487 512 r""" 488 Return $\log ( P[seq | model] )$, the log of the probability of seeing 489 the given sequence given this model, using the forward algorithm and 490 assuming independance of the sequence seq. 513 HMM Problem 1: Return $\log ( P[seq | model] )$, the log of 514 the probability of seeing the given sequence given this model, 515 using the forward algorithm and assuming independance of the 516 sequence seq. 491 517 492 WARNING: By convention we return 1.0 for 0 probability events. 518 INPUT: 519 seq -- a list; sequence of observed emissions of the HMM 520 521 OUTPUT: 522 float -- the log of the probability of seeing this sequence 523 given the model 524 525 WARNING: By convention we return -inf for 0 probability events. 493 526 494 527 EXAMPLES: 495 528 sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[1,0],[0,1]], [0,1]) … … 500 533 501 534 Notice that any sequence starting with 0 can't occur, since 502 535 the above model always starts in a state that produces 1 with 503 probability 1. By convention log(probability 0) is 1.536 probability 1. By convention log(probability 0) is -inf. 504 537 sage: a.log_likelihood([0,0]) 505 1.0538 -inf 506 539 507 540 Here's a special case where each sequence is equally probable. 508 541 sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.5,0.5]], [[1,0],[0,1]], [0.5,0.5]) … … 526 559 return -float('Inf') 527 560 return log_p 528 561 562 #################################################################### 563 # HMM Problem 2 -- Decoding: Given the complete parameter set that 564 # defines the model and an observation sequence seq, determine the 565 # best hidden sequence Q. Use the Viterbi algorithm. 566 #################################################################### 567 def viterbi(self, seq): 568 """ 569 HMM Problem 2: Determine a hidden sequence of states that is 570 most likely to produce the given sequence seq of obserations. 571 572 INPUT: 573 seq -- sequence of emitted symbols 574 575 OUTPUT: 576 list -- a most probable sequence of hidden states, i.e., the 577 Viterbi path. 578 float -- log of the probability that the sequence of hidden 579 states actually produced the given sequence seq. 580 [[TODO: I do not understand precisely what this means.]] 581 582 EXAMPLES: 583 sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[0.9,0.1],[0.1,0.9]], [0.5,0.5]) 584 sage: a.viterbi([1,0,0,1,0,0,1,1]) 585 ([1, 0, 0, 1, 1, 0, 1, 1], -11.062453224772216) 586 587 We predict the state sequence when the emissions are 3/4 and 'abc'. 588 sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[0.9,0.1],[0.1,0.9]], [0.5,0.5], [3/4, 'abc']) 589 590 Note that state 0 is common below, despite the model trying hard to 591 switch to state 1: 592 sage: a.viterbi([3/4, 'abc', 'abc'] + [3/4]*10) 593 ([0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], -25.299405845367794) 594 """ 595 if self._emission_symbols_dict: 596 seq = [self._emission_symbols_dict[z] for z in seq] 597 cdef int* path 598 cdef int* O = to_int_array(seq) 599 cdef int pathlen 600 cdef double log_p 601 602 path = ghmm_dmodel_viterbi(self.m, O, len(seq), &pathlen, &log_p) 603 sage_free(O) 604 p = [path[i] for i in range(pathlen)] 605 sage_free(path) 606 607 return p, log_p 608 609 #################################################################### 610 # HMM Problem 3 -- Learning: Given an observation sequence O and 611 # the set of states in the HMM, improve the HMM to increase the 612 # probability of observing O. 613 #################################################################### 614 def baum_welch(self, training_seqs, nsteps=None, log_likelihood_cutoff=None): 615 pass 616 ## /** Baum-Welch-Algorithm for parameter reestimation (training) in 617 ## a discrete (discrete output functions) HMM. Scaled version 618 ## for multiple sequences, alpha and beta matrices are allocated with 619 ## ighmm_cmatrix_stat_alloc 620 ## New parameters set directly in hmm (no storage of previous values!). 621 ## For reference see: 622 ## Rabiner, L.R.: "`A Tutorial on Hidden {Markov} Models and Selected 623 ## Applications in Speech Recognition"', Proceedings of the IEEE, 624 ## 77, no 2, 1989, pp 257--285 625 ## @return 0/-1 success/error 626 ## @param mo initial model 627 ## @param sq training sequences 628 ## */ 629 ## /** Just like reestimate_baum_welch, but you can limit 630 ## the maximum number of steps 631 ## @return 0/-1 success/error 632 ## @param mo initial model 633 ## @param sq training sequences 634 ## @param max_step maximal number of Baum-Welch steps 635 ## @param likelihood_delta minimal improvement in likelihood required for carrying on. Relative value 636 ## to log likelihood 637 ## */ 638 639 640 641 642 529 643 530 644 531 645 ##################################################################################