Ticket #3726: sage-3726-part3.patch

File sage-3726-part3.patch, 10.7 kB (added by was, 5 months ago)
  • /dev/null

    old new  
     1# Hidden Markov Models 
  • a/sage/stats/hmm/hmm.pyx

    old new  
    133133        ghmm_alphabet* alphabet 
    134134 
    135135 
    136     int ghmm_dmodel_normalize(ghmm_dmodel *m) 
    137     int ghmm_dmodel_free(ghmm_dmodel **m) 
    138     int ghmm_dmodel_logp(ghmm_dmodel *m, int *O, int len, double *log_p) 
    139  
    140  
    141136    # Discrete sequences 
    142137    ctypedef struct ghmm_dseq: 
    143138        # sequence array. sequence[i] [j] = j-th symbol of i-th seq 
     
    171166    ghmm_dseq *ghmm_dmodel_generate_sequences(ghmm_dmodel* mo, int seed, int global_len, 
    172167                                          long seq_number, int Tmax) 
    173168 
     169 
     170    int ghmm_dmodel_normalize(ghmm_dmodel *m) 
     171    int ghmm_dmodel_free(ghmm_dmodel **m) 
     172    int ghmm_dmodel_logp(ghmm_dmodel *m, int *O, int len, double *log_p) 
     173    int *ghmm_dmodel_viterbi (ghmm_dmodel *m, int *O, int len, int *pathlen, double *log_p) 
     174    int ghmm_dmodel_baum_welch (ghmm_dmodel *m, ghmm_dseq *sq) 
     175    int ghmm_dmodel_baum_welch_nstep (ghmm_dmodel * m, ghmm_dseq *sq, int max_step, 
     176                                      double likelihood_delta) 
     177     
     178 
    174179cdef class HiddenMarkovModel: 
    175180    pass 
    176181 
     
    178183cdef class DiscreteHiddenMarkovModel: 
    179184    cdef ghmm_dmodel* m 
    180185    cdef bint initialized 
    181     cdef object _emission_symbols 
    182     cdef object _emission_symbols_special     
     186    cdef object _emission_symbols, _emission_symbols_dict 
    183187    cdef Matrix_real_double_dense A, B 
    184188     
    185189    def __init__(self, A, B, pi, emission_symbols=None, name=None): 
     
    328332            ghmm_dmodel_free(&self.m) 
    329333 
    330334    def __repr__(self): 
     335        """ 
     336        Return string representation of this HMM. 
     337 
     338        OUTPUT: 
     339            string 
     340 
     341        EXAMPLES: 
     342            sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[0.9,0.1],[0.1,0.9]], [0.5,0.5], [3/4, 'abc']) 
     343            sage: a.__repr__() 
     344            "Discrete Hidden Markov Model (2 states, 2 outputs)\n\nInitial probabilities: [0.5, 0.5]\nTransition matrix:\n[0.1 0.9]\n[0.1 0.9]\nEmission matrix:\n[0.9 0.1]\n[0.1 0.9]\nEmission symbols: [3/4, 'abc']" 
     345        """ 
    331346        s = "Discrete Hidden Markov Model%s (%s states, %s outputs)\n"%( 
    332347            ' ' + self.m.name if self.m.name else '', 
    333348            self.m.N, self.m.M) 
    334         s += 'Initial probabilities: %s\n'%self.initial_probabilities() 
    335         s += 'Transition matrix:\n%s\n'%self.transition_matrix() 
    336         s += 'Emission matrix:\n%s\n'%self.emission_matrix() 
     349        s += '\nInitial probabilities: %s'%self.initial_probabilities() 
     350        s += '\nTransition matrix:\n%s'%self.transition_matrix() 
     351        s += '\nEmission matrix:\n%s'%self.emission_matrix() 
     352        if self._emission_symbols_dict: 
     353            s += '\nEmission symbols: %s'%self._emission_symbols 
    337354        return s 
    338355 
    339356    def initial_probabilities(self): 
     
    422439        cdef ghmm_dseq *d = ghmm_dmodel_generate_sequences(self.m, seed, length, 1, length) 
    423440        cdef Py_ssize_t i 
    424441        v = [d.seq[0][i] for i in range(length)] 
    425         if self._emission_symbols_special: 
    426             return v 
    427         else: 
     442        if self._emission_symbols_dict: 
    428443            w = self._emission_symbols 
    429444            return [w[i] for i in v] 
     445        else: 
     446            return v 
    430447 
    431448    def sample(self, long length, long number): 
    432449        """ 
     
    442459        cdef ghmm_dseq *d = ghmm_dmodel_generate_sequences(self.m, seed, length, number, length) 
    443460        cdef Py_ssize_t i, j 
    444461        v = [[d.seq[j][i] for i in range(length)] for j in range(number)] 
    445         if self._emission_symbols_special: 
    446             return v 
    447         else: 
     462        if self._emission_symbols_dict: 
    448463            w = self._emission_symbols 
    449464            return [[w[i] for i in z] for z in v] 
     465        else: 
     466            return v 
    450467 
    451468    def emission_symbols(self): 
    452469        """ 
     
    479496        """ 
    480497        if emission_symbols is None: 
    481498            self._emission_symbols = range(self.B.ncols()) 
     499            self._emission_symbols_dict = None 
    482500        else: 
    483501            self._emission_symbols = list(emission_symbols) 
    484         self._emission_symbols_special = (self._emission_symbols == range(self.B.ncols())) 
     502            if self._emission_symbols != range(self.B.ncols()): 
     503                self._emission_symbols_dict = dict([(x,i) for i, x in enumerate(emission_symbols)]) 
    485504 
    486     cpdef double log_likelihood(self, seq): 
     505 
     506    #################################################################### 
     507    # HMM Problem 1 -- Computing likelihood: Given the parameter set 
     508    # lambda of an HMM model and an observation sequence O, determine 
     509    # the likelihood P(O | lambda). 
     510    #################################################################### 
     511    def log_likelihood(self, seq): 
    487512        r""" 
    488         Return $\log ( P[seq | model] )$, the log of the probability of seeing 
    489         the given sequence given this model, using the forward algorithm and 
    490         assuming independance of the sequence seq. 
     513        HMM Problem 1: Return $\log ( P[seq | model] )$, the log of 
     514        the probability of seeing the given sequence given this model, 
     515        using the forward algorithm and assuming independance of the 
     516        sequence seq. 
    491517 
    492         WARNING: By convention we return 1.0 for 0 probability events.  
     518        INPUT: 
     519            seq -- a list; sequence of observed emissions of the HMM 
     520 
     521        OUTPUT: 
     522            float -- the log of the probability of seeing this sequence 
     523                     given the model  
     524 
     525        WARNING: By convention we return -inf for 0 probability events.  
    493526         
    494527        EXAMPLES: 
    495528            sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[1,0],[0,1]], [0,1]) 
     
    500533 
    501534        Notice that any sequence starting with 0 can't occur, since 
    502535        the above model always starts in a state that produces 1 with 
    503         probability 1.  By convention log(probability 0) is 1
     536        probability 1.  By convention log(probability 0) is -inf
    504537            sage: a.log_likelihood([0,0]) 
    505             1.0 
     538            -inf 
    506539 
    507540        Here's a special case where each sequence is equally probable. 
    508541            sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.5,0.5]], [[1,0],[0,1]], [0.5,0.5]) 
     
    526559            return -float('Inf') 
    527560        return log_p 
    528561 
     562    #################################################################### 
     563    # HMM Problem 2 -- Decoding: Given the complete parameter set that 
     564    # defines the model and an observation sequence seq, determine the 
     565    # best hidden sequence Q.  Use the Viterbi algorithm. 
     566    ####################################################################     
     567    def viterbi(self, seq): 
     568        """ 
     569        HMM Problem 2: Determine a hidden sequence of states that is 
     570        most likely to produce the given sequence seq of obserations. 
     571 
     572        INPUT: 
     573            seq -- sequence of emitted symbols 
     574             
     575        OUTPUT: 
     576            list -- a most probable sequence of hidden states, i.e., the 
     577                    Viterbi path. 
     578            float -- log of the probability that the sequence of hidden 
     579                     states actually produced the given sequence seq. 
     580                     [[TODO: I do not understand precisely what this means.]] 
     581 
     582        EXAMPLES: 
     583            sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[0.9,0.1],[0.1,0.9]], [0.5,0.5]) 
     584            sage: a.viterbi([1,0,0,1,0,0,1,1]) 
     585            ([1, 0, 0, 1, 1, 0, 1, 1], -11.062453224772216) 
     586             
     587        We predict the state sequence when the emissions are 3/4 and 'abc'.  
     588            sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[0.9,0.1],[0.1,0.9]], [0.5,0.5], [3/4, 'abc']) 
     589 
     590        Note that state 0 is common below, despite the model trying hard to 
     591        switch to state 1: 
     592            sage: a.viterbi([3/4, 'abc', 'abc'] + [3/4]*10) 
     593            ([0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], -25.299405845367794) 
     594        """ 
     595        if self._emission_symbols_dict: 
     596            seq = [self._emission_symbols_dict[z] for z in seq] 
     597        cdef int* path 
     598        cdef int* O = to_int_array(seq) 
     599        cdef int pathlen 
     600        cdef double log_p 
     601 
     602        path = ghmm_dmodel_viterbi(self.m, O, len(seq), &pathlen, &log_p) 
     603        sage_free(O) 
     604        p = [path[i] for i in range(pathlen)] 
     605        sage_free(path) 
     606 
     607        return p, log_p 
     608 
     609    #################################################################### 
     610    # HMM Problem 3 -- Learning: Given an observation sequence O and 
     611    # the set of states in the HMM, improve the HMM to increase the 
     612    # probability of observing O. 
     613    #################################################################### 
     614    def baum_welch(self, training_seqs, nsteps=None, log_likelihood_cutoff=None): 
     615        pass 
     616## /** Baum-Welch-Algorithm for parameter reestimation (training) in 
     617##     a discrete (discrete output functions) HMM. Scaled version 
     618##     for multiple sequences, alpha and beta matrices are allocated with 
     619##      ighmm_cmatrix_stat_alloc  
     620##     New parameters set directly in hmm (no storage of previous values!). 
     621##     For reference see:   
     622##     Rabiner, L.R.: "`A Tutorial on Hidden {Markov} Models and Selected 
     623##                 Applications in Speech Recognition"', Proceedings of the IEEE, 
     624##      77, no 2, 1989, pp 257--285     
     625##   @return            0/-1 success/error 
     626##   @param mo          initial model 
     627##   @param sq          training sequences 
     628##   */ 
     629## /** Just like reestimate_baum_welch, but you can limit 
     630##     the maximum number of steps 
     631##   @return            0/-1 success/error 
     632##   @param mo          initial model 
     633##   @param sq          training sequences 
     634##   @param max_step    maximal number of Baum-Welch steps 
     635##   @param likelihood_delta minimal improvement in likelihood required for carrying on. Relative value 
     636##   to log likelihood 
     637##   */ 
     638 
     639         
     640         
     641 
     642     
    529643     
    530644 
    531645##################################################################################