Ticket #3726: sage-3726-part4.patch

File sage-3726-part4.patch, 41.8 kB (added by was, 5 months ago)
  • a/sage/stats/hmm/all.py

    old new  
    1 from hmm import DiscreteHiddenMarkovModel 
     1from hmm  import DiscreteHiddenMarkovModel 
     2from chmm import NormalHiddenMarkovModel 
  • /dev/null

    old new  
     1 
     2from hmm cimport HiddenMarkovModel 
     3 
     4cdef extern from "ghmm/ghmm.h": 
     5    cdef int GHMM_kContinuousHMM 
     6 
     7cdef extern from "ghmm/smodel.h": 
     8    ############################################################# 
     9    # Continuous emission density types 
     10    ############################################################# 
     11    cdef enum ghmm_density_t: 
     12        normal, 
     13        normal_right,   # right tail 
     14        normal_approx, 
     15        normal_left,    # left tail 
     16        uniform, 
     17        binormal, 
     18        multinormal, 
     19        density_number 
     20 
     21    ############################################################# 
     22    # Continuous emission 
     23    ############################################################# 
     24    # Mean and variance types 
     25    cdef union mean_t: 
     26        double val 
     27        double *vec 
     28         
     29    cdef union variance_t: 
     30        double val 
     31        double *mat 
     32 
     33    cdef struct ghmm_c_emission: 
     34        # Flag for density function for each component of the mixture 
     35        #   0: normal density 
     36        #   1: truncated normal (right side) density 
     37        #   2: approximated normal density 
     38        #   3: truncated normal (left side) 
     39        #   4: uniform distribution 
     40        #   6: multivariate normal 
     41        int type 
     42        # dimension > 1 for multivariate normals 
     43        int dimension 
     44        # mean for output functions (pointer to mean vector for multivariate) 
     45        mean_t mean 
     46        variance_t variance 
     47        # pointer to inverse of covariance matrix if multivariate normal; 
     48        # else NULL  
     49        double *sigmainv 
     50        # determinant of covariance matrix for multivariate normal  
     51        double det 
     52        # Cholesky decomposition of covariance matrix A, 
     53        #   if A = G*G' sigmacd only holds G 
     54        double *sigmacd 
     55        # minimum of uniform distribution or left boundary for 
     56        # right-tail gaussians 
     57        double min 
     58        # maximum of uniform distribution or right boundary for 
     59        # left-tail gaussians 
     60        double max 
     61        # if fixed != 0 the parameters of the density are fixed  
     62        int fixed 
     63         
     64    ############################################################# 
     65    # Continous Emission States 
     66    ############################################################# 
     67    ctypedef struct ghmm_cstate: 
     68        # Number of output densities per state  
     69        int M 
     70        # initial prob. 
     71        double pi 
     72        # IDs of successor states 
     73        int *out_id 
     74        # IDs of predecessor states 
     75        int *in_id 
     76        # transition probs to successor states. It is a 
     77        # matrix in case of mult. transition matrices (COS > 1) 
     78        double **out_a 
     79        # transition probs from predecessor states. It is a 
     80        # matrix in case of mult. transition matrices (COS > 1) 
     81        double **in_a 
     82        # number of  successor states 
     83        int out_states 
     84        # number of  predecessor states 
     85        int in_states 
     86        # weight vector for output function components 
     87        double *c 
     88        # flag for fixation of parameter. If fix = 1 do not change parameters of 
     89        # output functions, and if fix = 0 do normal training. Default is 0. 
     90        int fix 
     91        # vector of ghmm_c_emission 
     92        # (type and parameters of output function components) 
     93        ghmm_c_emission *e 
     94        # contains a description of the state (null terminated utf-8) 
     95        char *desc 
     96        # x coordinate position for graph representation plotting 
     97        int xPosition 
     98        # y coordinate position for graph representation plotting 
     99        int yPosition 
     100 
     101    ############################################################# 
     102    # Continous Hidden Markov Model 
     103    ############################################################# 
     104    ctypedef struct ghmm_cmodel 
     105         
     106    ctypedef struct ghmm_cmodel_class_change_context: 
     107        # Names of class change module/function (for python callback) 
     108        char *python_module 
     109        char *python_function 
     110        # index of current sequence  
     111        int k 
     112        # pointer to class function  
     113        int (*get_class) (ghmm_cmodel*, double*, int, int) 
     114        # space for any data necessary for class switch, USER is RESPONSIBLE 
     115        void *user_data 
     116 
     117    ctypedef struct ghmm_cmodel: 
     118        # Number of states 
     119        int N 
     120        # Maximun number of components in the states 
     121        int M 
     122        # Number of dimensions of the emission components. 
     123        # NOTE: All emissions must have the same number of dimensions. 
     124        int dim 
     125        # The ghmm_cmodel includes two continuous models 
     126        #   cos=1: one transition matrix 
     127        #   cos>1: (integer) extension for models with several matrices. 
     128        int cos 
     129        # Prior for a priori probability of the model. 
     130        # -1 means no prior specified (all models have equal prob. a priori.) 
     131        double prior 
     132        # Contains a arbitrary name for the model (null terminated utf-8) 
     133        char * name 
     134        # Contains bit flags for various model extensions such as 
     135        # kSilentStates (see ghmm.h for a complete list). 
     136        int model_type 
     137        # All states of the model. Transition probabilities are part of the states. 
     138        ghmm_cstate *s 
     139        # pointer to a ghmm_cmodel_class_change_context struct 
     140        # necessary for multiple transition classes 
     141        ghmm_cmodel_class_change_context *class_change 
     142 
     143    int ghmm_cmodel_class_change_alloc(ghmm_cmodel * smo) 
     144 
     145    ############################################################# 
     146    # Continous Sequence of states 
     147    # 
     148    # Sequence structure for double sequences.  Contains an array of 
     149    # sequences and corresponding data like sequence labels, sequence 
     150    # weights, etc. Sequences may have different length. 
     151    # multi-dimension sequences are linearized 
     152    ############################################################# 
     153    ctypedef struct ghmm_cseq: 
     154        # sequence array. sequence[i][j] = j-th symbol of i-th seq. 
     155        # sequence[i][D * j] = first dimension of j-th observation of i-th sequence 
     156        double **seq 
     157        # array of sequence length 
     158        int *seq_len 
     159        # array of sequence IDs 
     160        double *seq_id 
     161        # positive! sequence weights.  default is 1 = no weight 
     162        double *seq_w 
     163        # total number of sequences 
     164        long seq_number 
     165        # reserved space for sequences is always >= seq_number 
     166        long capacity 
     167        # sum of sequence weights 
     168        double total_w 
     169        # total number of dimensions  
     170        int dim 
     171        # flags (internal) 
     172        unsigned int flags 
     173 
     174 
     175    ############################################################# 
     176    # Continous Baum-Welch algorithm context 
     177    ############################################################# 
     178    ctypedef struct ghmm_cmodel_baum_welch_context: 
     179        # pointer of continuous model 
     180        ghmm_cmodel *smo 
     181        # sequence pointer 
     182        ghmm_cseq *sqd 
     183        # calculated log likelihood  
     184        double *logp 
     185        # leave reestimation loop if diff. between successive logp values 
     186        # is smaller than eps  
     187        double eps 
     188        # max. no of iterations 
     189        int max_iter 
     190 
     191 
     192    int ghmm_cmodel_free (ghmm_cmodel ** smo) 
     193     
     194         
     195cdef class ContinuousHiddenMarkovModel(HiddenMarkovModel): 
     196    cdef ghmm_cmodel* m 
     197    cdef bint initialized 
     198 
     199cdef class NormalHiddenMarkovModel(ContinuousHiddenMarkovModel): 
     200    pass 
     201 
     202 
  • /dev/null

    old new  
     1 
     2 
     3include "../../ext/stdsage.pxi" 
     4 
     5include "misc.pxi" 
     6 
     7cdef class ContinuousHiddenMarkovModel(HiddenMarkovModel): 
     8    def __init__(self, A, B, pi, name=None): 
     9        self.initialized = False 
     10        HiddenMarkovModel.__init__(self, A, B, pi) 
     11        self.m = <ghmm_cmodel*> safe_malloc(sizeof(ghmm_cmodel)) 
     12        # Set number of states 
     13        self.m.N = self.A.nrows() 
     14 
     15cdef class NormalHiddenMarkovModel(ContinuousHiddenMarkovModel): 
     16    """ 
     17    EXAMPLES: 
     18    The transition matrix: 
     19        sage: A = [[0,1,0],[0.5,0,0.5],[0.3,0.3,0.4]] 
     20 
     21    Parameters of the normal emission distributions in pairs of (mu, sigma): 
     22        sage: B = [(0,1), (-1,0.5), (1,0.2)] 
     23 
     24    The initial probabilities per state: 
     25        sage: pi = [1,0,0] 
     26 
     27    Create the continuous HMM: 
     28        sage: m = hmm.NormalHiddenMarkovModel(A, B, pi); m 
     29    """ 
     30    def __init__(self, A, B, pi, name=None): 
     31        ContinuousHiddenMarkovModel.__init__(self, A, B, pi) 
     32 
     33        # Set number of outputs 
     34        self.m.M = 1 
     35 
     36        # Set the model type to continuous 
     37        self.m.model_type = GHMM_kContinuousHMM 
     38 
     39        # Assign model identifier if specified 
     40        if name is not None:  
     41            name = str(name) 
     42            self.m.name = name 
     43        else: 
     44            self.m.name = NULL 
     45             
     46        # 1 transition matrix 
     47        self.m.cos   =  1 
     48        # Set that no a prior model probabilities are set. 
     49        self.m.prior = -1 
     50        # Dimension is 1 
     51        self.m.dim   =  1 
     52 
     53        # Allocate and initialize states 
     54        cdef ghmm_cstate* states = <ghmm_cstate*> safe_malloc(sizeof(ghmm_cstate) * self.m.N) 
     55        cdef ghmm_cstate* state 
     56        cdef ghmm_c_emission* e 
     57 
     58        for i in range(self.m.N): 
     59            # Parameters of normal distribution 
     60            mu, sigma = self.B[i] 
     61            # Get a reference to the i-th state for convenience of the notation below. 
     62            state = &(states[i]) 
     63            state.desc = NULL 
     64            state.M = 1 
     65            e = <ghmm_c_emission*> safe_malloc(sizeof(ghmm_c_emission)) 
     66            e.type = 0  # normal 
     67            e.dimension = 1 
     68            e.mean.val = mu 
     69            e.variance.val = sigma 
     70            # fixing of emissions is deactivated by default             
     71            e.fixed = 0 
     72            e.sigmacd = NULL 
     73            e.sigmainv = NULL 
     74            state.e = e 
     75            state.c = to_double_array([1.0]) 
     76 
     77        # Set states 
     78        self.m.s = states 
     79 
     80        self.initialized = True 
     81 
     82    def __dealloc__(self): 
     83        return 
     84        if self.initialized: 
     85            ghmm_cmodel_free(&self.m) 
     86 
     87    def __repr__(self): 
     88        """ 
     89        Return string representation of this Continuous HMM. 
     90 
     91        OUTPUT: 
     92            string 
     93 
     94        EXAMPLES: 
     95            sage: m = hmm.NormalHiddenMarkovModel([[0.0,1.0,0],[0.5,0.0,0.5],[0.3,0.3,0.4]], [(0.0,1.0), (-1.0,0.5), (1.0,0.2)], [1,0,0]) 
     96            sage: a.__repr__() 
     97            "Discrete Hidden Markov Model (2 states, 2 outputs)\nInitial probabilities: [0.5, 0.5]\nTransition matrix:\n[0.1 0.9]\n[0.1 0.9]\nEmission matrix:\n[0.9 0.1]\n[0.1 0.9]\nEmission symbols: [3/4, 'abc']" 
     98        """ 
     99        s = "Normal Hidden Markov Model%s (%s states, %s outputs)"%( 
     100            ' ' + self.m.name if self.m.name else '', 
     101            self.m.N, self.m.M) 
     102        #s += '\nInitial probabilities: %s'%self.initial_probabilities() 
     103        #s += '\nTransition matrix:\n%s'%self.transition_matrix() 
     104        #s += '\nEmission matrix:\n%s'%self.emission_matrix() 
     105        return s 
     106 
  • /dev/null

    old new  
     1from sage.matrix.matrix_real_double_dense cimport Matrix_real_double_dense 
     2 
     3cdef extern from "ghmm/ghmm.h": 
     4    cdef int GHMM_kSilentStates 
     5    cdef int GHMM_kHigherOrderEmissions 
     6    cdef int GHMM_kDiscreteHMM 
     7 
     8cdef extern from "ghmm/model.h": 
     9    ctypedef struct ghmm_dstate: 
     10        # Initial probability 
     11        double pi 
     12        # Output probability 
     13        double *b 
     14 
     15        # ID's of the following states 
     16        int *out_id 
     17        # ID's of the previous states 
     18        int *in_id 
     19 
     20        # Transition probabilities to successor states. 
     21        double *out_a 
     22        # Transition probabilities from predecessor states. 
     23        double *in_a 
     24 
     25        # Number of successor states 
     26        int out_states 
     27        # Number of precursor states 
     28        int in_states 
     29        # if fix == 1 then output probabilities b stay fixed during the training 
     30        int fix 
     31        # Contains a description of the state (null terminated utf-8) 
     32        char * desc 
     33        # x coordinate position for graph representation plotting 
     34        int xPosition 
     35        # y coordinate position for graph representation plotting 
     36        int yPosition 
     37 
     38    ctypedef struct ghmm_dbackground: 
     39        pass 
     40 
     41    ctypedef struct ghmm_alphabet: 
     42        pass 
     43     
     44    # Discrete HMM's. 
     45    ctypedef struct ghmm_dmodel: 
     46        # Number of states 
     47        int N    
     48        # Number of outputs 
     49        int M    
     50        # Vector of states 
     51        ghmm_dstate *s   
     52        # Contains bit flags for varios model extensions such as 
     53        # kSilentStates, kTiedEmissions (see ghmm.h for a complete list) 
     54        int model_type 
     55        # The a priori probability for the model. 
     56        # A value of -1 indicates that no prior is defined.  
     57        # Note: this is not to be confused with priors on emission distributions. 
     58        double prior 
     59        # An arbitrary name for the model (null terminated utf-8) 
     60        char * name 
     61 
     62        # Flag variables for each state indicating whether it is emitting or not.  
     63        # Note: silent != NULL iff (model_type & kSilentStates) == 1  
     64        int *silent 
     65         
     66        # Int variable for the maximum level of higher order emissions. 
     67        int maxorder 
     68         
     69        # saves the history of emissions as int,  
     70        # the nth-last emission is (emission_history * |alphabet|^n+1) % |alphabet| 
     71        int emission_history 
     72 
     73        # Flag variables for each state indicating whether the states emissions 
     74        # are tied to another state. Groups of tied states are represented 
     75        # by their tie group leader (the lowest numbered member of the group). 
     76        # tied_to[s] == kUntied  : s is not a tied state 
     77        # tied_to[s] == s        : s is a tie group leader 
     78        # tied_to[t] == s        : t is tied to state s (t>s) 
     79        # Note: tied_to != NULL iff (model_type & kTiedEmissions) != 0  */ 
     80        int *tied_to 
     81 
     82        # Note: State store order information of the emissions. 
     83        # Classical HMMS have emission order 0; that is, the emission probability 
     84        # is conditioned only on the state emitting the symbol. 
     85        # For higher order emissions, the emission are conditioned on the state s 
     86        # as well as the previous emission_order observed symbols. 
     87        # The emissions are stored in the state's usual double* b.  
     88        # Note: order != NULL iff (model_type & kHigherOrderEmissions) != 0  */ 
     89        int * order 
     90         
     91        # ghmm_dbackground is a pointer to a 
     92        # ghmm_dbackground structure, which holds (essentially) an 
     93        # array of background distributions (which are just vectors of floating 
     94        # point numbers like state.b). 
     95        # For each state the array background_id indicates which of the background 
     96        # distributions to use in parameter estimation. A value of kNoBackgroundDistribution 
     97        # indicates that none should be used. 
     98        # Note: background_id != NULL iff (model_type & kHasBackgroundDistributions) != 0  
     99        int *background_id 
     100        ghmm_dbackground *bp 
     101 
     102        # Topological ordering of silent states  
     103        #  Condition: topo_order != NULL iff (model_type & kSilentStates) != 0  
     104        int *topo_order 
     105        int topo_order_length 
     106 
     107        # pow_lookup is a array of precomputed powers 
     108        # It contains in the i-th entry M (alphabet size) to the power of i 
     109        # The last entry is maxorder+1. 
     110        int *pow_lookup 
     111 
     112        # Store for each state a class label. Limits the possibly state sequence 
     113        # Note: label != NULL iff (model_type & kLabeledStates) != 0  
     114        int* label 
     115        ghmm_alphabet* label_alphabet 
     116        ghmm_alphabet* alphabet 
     117 
     118 
     119    # Discrete sequences 
     120    ctypedef struct ghmm_dseq: 
     121        # sequence array. sequence[i] [j] = j-th symbol of i-th seq 
     122        int **seq 
     123        # matrix of state ids, can be used to save the viterbi path during sequence generation. 
     124        # ATTENTION: is NOT allocated by ghmm_dseq_calloc  
     125        int **states 
     126        # array of sequence length 
     127        int *seq_len 
     128        # array of state path lengths 
     129        int *states_len 
     130        # array of sequence IDs 
     131        double *seq_id 
     132        # positive sequence weights.  default is 1 = no weight 
     133        double *seq_w 
     134        # total number of sequences  
     135        long seq_number 
     136        # reserved space for sequences is always >= seq_number 
     137        long capacity 
     138        # sum of sequence weights 
     139        double total_w 
     140        # matrix of state labels corresponding to seq 
     141        int **state_labels 
     142        # number of labels for each sequence 
     143        int *state_labels_len 
     144        # internal flags 
     145        unsigned int flags 
     146 
     147    ghmm_dseq *ghmm_dmodel_label_generate_sequences( 
     148        ghmm_dmodel * mo, int seed, int global_len, long seq_number, int Tmax) 
     149    ghmm_dseq *ghmm_dmodel_generate_sequences(ghmm_dmodel* mo, int seed, int global_len, 
     150                                          long seq_number, int Tmax) 
     151    int ghmm_dseq_free (ghmm_dseq ** sq) 
     152    ghmm_dseq *ghmm_dseq_calloc (long seq_number) 
     153 
     154    int ghmm_dmodel_normalize(ghmm_dmodel *m) 
     155    int ghmm_dmodel_free(ghmm_dmodel **m) 
     156 
     157    # Problem 1: Likelihood 
     158    int ghmm_dmodel_logp(ghmm_dmodel *m, int *O, int len, double *log_p) 
     159 
     160    # Problem 2: Decoding 
     161    int *ghmm_dmodel_viterbi (ghmm_dmodel *m, int *O, int len, int *pathlen, double *log_p) 
     162 
     163    # Problem 3: Learning 
     164    int ghmm_dmodel_baum_welch (ghmm_dmodel *m, ghmm_dseq *sq) 
     165     
     166 
     167cdef class HiddenMarkovModel: 
     168    cdef Matrix_real_double_dense A, B 
     169    cdef list pi 
     170 
     171cdef class DiscreteHiddenMarkovModel(HiddenMarkovModel): 
     172    cdef ghmm_dmodel* m 
     173    cdef bint initialized 
     174    cdef object _emission_symbols, _emission_symbols_dict 
  • a/sage/stats/hmm/hmm.pyx

    old new  
    55 
    66EXAMPLES: 
    77 
     8TODO: 
     9   * make models pickleable (i.e., all parameters should be obtainable 
     10     using functions to make this easy). 
     11   * continuous hmm's 
    812""" 
    913 
    1014import math 
     
    1317from sage.rings.all  import RDF 
    1418from sage.misc.randstate import random 
    1519 
    16 from sage.matrix.matrix_real_double_dense cimport Matrix_real_double_dense 
    17  
    1820include "../../ext/stdsage.pxi" 
    1921 
    20 cdef extern from "ghmm/ghmm.h": 
    21     cdef int GHMM_kSilentStates 
    22     cdef int GHMM_kHigherOrderEmissions 
    23     cdef int GHMM_kDiscreteHMM 
    24  
    25 cdef extern from "ghmm/model.h": 
    26     ctypedef struct ghmm_dstate: 
    27         # Initial probability 
    28         double pi 
    29         # Output probability 
    30         double *b 
    31  
    32         # ID's of the following states 
    33         int *out_id 
    34         # ID's of the previous states 
    35         int *in_id 
    36  
    37         # Transition probabilities to successor states. 
    38         double *out_a 
    39         # Transition probabilities from predecessor states. 
    40         double *in_a 
    41  
    42         # Number of successor states 
    43         int out_states 
    44         # Number of precursor states 
    45         int in_states 
    46         # if fix == 1 then output probabilities b stay fixed during the training 
    47         int fix 
    48         # Contains a description of the state (null terminated utf-8) 
    49         char * desc 
    50         # x coordinate position for graph representation plotting 
    51         int xPosition 
    52         # y coordinate position for graph representation plotting 
    53         int yPosition 
    54  
    55     ctypedef struct ghmm_dbackground: 
    56         pass 
    57  
    58     ctypedef struct ghmm_alphabet: 
    59         pass 
    60      
    61     # Discrete HMM's. 
    62     ctypedef struct ghmm_dmodel: 
    63         # Number of states 
    64         int N    
    65         # Number of outputs 
    66         int M    
    67         # Vector of states 
    68         ghmm_dstate *s   
    69         # Contains bit flags for varios model extensions such as 
    70         # kSilentStates, kTiedEmissions (see ghmm.h for a complete list) 
    71         int model_type 
    72         # The a priori probability for the model. 
    73         # A value of -1 indicates that no prior is defined.  
    74         # Note: this is not to be confused with priors on emission distributions. 
    75         double prior 
    76         # An arbitrary name for the model (null terminated utf-8) 
    77         char * name 
    78  
    79         # Flag variables for each state indicating whether it is emitting or not.  
    80         # Note: silent != NULL iff (model_type & kSilentStates) == 1  
    81         int *silent 
    82          
    83         # Int variable for the maximum level of higher order emissions. 
    84         int maxorder 
    85          
    86         # saves the history of emissions as int,  
    87         # the nth-last emission is (emission_history * |alphabet|^n+1) % |alphabet| 
    88         int emission_history 
    89  
    90         # Flag variables for each state indicating whether the states emissions 
    91         # are tied to another state. Groups of tied states are represented 
    92         # by their tie group leader (the lowest numbered member of the group). 
    93         # tied_to[s] == kUntied  : s is not a tied state 
    94         # tied_to[s] == s        : s is a tie group leader 
    95         # tied_to[t] == s        : t is tied to state s (t>s) 
    96         # Note: tied_to != NULL iff (model_type & kTiedEmissions) != 0  */ 
    97         int *tied_to 
    98  
    99         # Note: State store order information of the emissions. 
    100         # Classical HMMS have emission order 0; that is, the emission probability 
    101         # is conditioned only on the state emitting the symbol. 
    102         # For higher order emissions, the emission are conditioned on the state s 
    103         # as well as the previous emission_order observed symbols. 
    104         # The emissions are stored in the state's usual double* b.  
    105         # Note: order != NULL iff (model_type & kHigherOrderEmissions) != 0  */ 
    106         int * order 
    107          
    108         # ghmm_dbackground is a pointer to a 
    109         # ghmm_dbackground structure, which holds (essentially) an 
    110         # array of background distributions (which are just vectors of floating 
    111         # point numbers like state.b). 
    112         # For each state the array background_id indicates which of the background 
    113         # distributions to use in parameter estimation. A value of kNoBackgroundDistribution 
    114         # indicates that none should be used. 
    115         # Note: background_id != NULL iff (model_type & kHasBackgroundDistributions) != 0  
    116         int *background_id 
    117         ghmm_dbackground *bp 
    118  
    119         # Topological ordering of silent states  
    120         #  Condition: topo_order != NULL iff (model_type & kSilentStates) != 0  
    121         int *topo_order 
    122         int topo_order_length 
    123  
    124         # pow_lookup is a array of precomputed powers 
    125         # It contains in the i-th entry M (alphabet size) to the power of i 
    126         # The last entry is maxorder+1. 
    127         int *pow_lookup 
    128  
    129         # Store for each state a class label. Limits the possibly state sequence 
    130         # Note: label != NULL iff (model_type & kLabeledStates) != 0  
    131         int* label 
    132         ghmm_alphabet* label_alphabet 
    133         ghmm_alphabet* alphabet 
    134  
    135  
    136     # Discrete sequences 
    137     ctypedef struct ghmm_dseq: 
    138         # sequence array. sequence[i] [j] = j-th symbol of i-th seq 
    139         int **seq 
    140         # matrix of state ids, can be used to save the viterbi path during sequence generation. 
    141         # ATTENTION: is NOT allocated by ghmm_dseq_calloc  
    142         int **states 
    143         # array of sequence length 
    144         int *seq_len 
    145         # array of state path lengths 
    146         int *states_len 
    147         # array of sequence IDs 
    148         double *seq_id 
    149         # positive sequence weights.  default is 1 = no weight 
    150         double *seq_w 
    151         # total number of sequences  
    152         long seq_number 
    153         # reserved space for sequences is always >= seq_number 
    154         long capacity 
    155         # sum of sequence weights 
    156         double total_w 
    157         # matrix of state labels corresponding to seq 
    158         int **state_labels 
    159         # number of labels for each sequence 
    160         int *state_labels_len 
    161         # internal flags 
    162         unsigned int flags 
    163  
    164     ghmm_dseq *ghmm_dmodel_label_generate_sequences( 
    165         ghmm_dmodel * mo, int seed, int global_len, long seq_number, int Tmax) 
    166     ghmm_dseq *ghmm_dmodel_generate_sequences(ghmm_dmodel* mo, int seed, int global_len, 
    167                                           long seq_number, int Tmax) 
    168  
    169  
    170     int ghmm_dmodel_normalize(ghmm_dmodel *m) 
    171     int ghmm_dmodel_free(ghmm_dmodel **m) 
    172     int ghmm_dmodel_logp(ghmm_dmodel *m, int *O, int len, double *log_p) 
    173     int *ghmm_dmodel_viterbi (ghmm_dmodel *m, int *O, int len, int *pathlen, double *log_p) 
    174     int ghmm_dmodel_baum_welch (ghmm_dmodel *m, ghmm_dseq *sq) 
    175     int ghmm_dmodel_baum_welch_nstep (ghmm_dmodel * m, ghmm_dseq *sq, int max_step, 
    176                                       double likelihood_delta) 
    177      
     22include "misc.pxi" 
    17823 
    17924cdef class HiddenMarkovModel: 
    180     pass 
    181  
    182  
    183 cdef class DiscreteHiddenMarkovModel: 
    184     cdef ghmm_dmodel* m 
    185     cdef bint initialized 
    186     cdef object _emission_symbols, _emission_symbols_dict 
    187     cdef Matrix_real_double_dense A, B 
    188      
    189     def __init__(self, A, B, pi, emission_symbols=None, name=None): 
    190         """ 
    191         INPUTS: 
    192             A  -- square matrix of doubles; the state change probabilities 
    193             B  -- matrix of doubles; emission probabilities 
    194             pi -- list of doubles; probabilities for each initial state 
    195             emission_state -- list of B.ncols() symbols (just used for printing) 
    196             name -- (optional) name of the model 
    197  
    198         EXAMPLES: 
    199         We create a discrete HMM with 2 internal states on an alphabet of 
    200         size 2. 
    201          
    202             sage: a = hmm.DiscreteHiddenMarkovModel([[0.2,0.8],[0.5,0.5]], [[1,0],[0,1]], [0,1]) 
    203              
    204         """ 
     25    def __init__(self, A, B, pi): 
    20526        if not is_Matrix(A): 
    20627            A = matrix(RDF, len(A), len(A[0]), A) 
    20728        if not is_Matrix(B): 
     
    21435            A = A.change_ring(RDF) 
    21536        if B.base_ring() != RDF: 
    21637            B = B.change_ring(RDF) 
    217         if len(pi) != A.nrows(): 
     38 
     39        self.pi = [float(x) for x in pi] 
     40        if len(self.pi) != A.nrows(): 
    21841            raise ValueError, "length of pi must equal number of rows of A" 
    219  
    220         cdef Py_ssize_t i, j, k 
    22142 
    22243        self.A = A 
    22344        self.B = B 
     45 
     46 
     47cdef class DiscreteHiddenMarkovModel(HiddenMarkovModel): 
     48     
     49    def __init__(self, A, B, pi, emission_symbols=None, name=None): 
     50        """ 
     51        INPUTS: 
     52            A  -- square matrix of doubles; the state change probabilities 
     53            B  -- matrix of doubles; emission probabilities 
     54            pi -- list of floats; probabilities for each initial state 
     55            emission_state -- list of B.ncols() symbols (just used for printing) 
     56            name -- (optional) name of the model 
     57 
     58        EXAMPLES: 
     59        We create a discrete HMM with 2 internal states on an alphabet of 
     60        size 2. 
     61         
     62            sage: a = hmm.DiscreteHiddenMarkovModel([[0.2,0.8],[0.5,0.5]], [[1,0],[0,1]], [0,1]) 
     63             
     64        """ 
     65        self.initialized = False 
     66        HiddenMarkovModel.__init__(self, A, B, pi) 
     67 
     68        cdef Py_ssize_t i, j, k 
    22469        self.set_emission_symbols(emission_symbols) 
    22570 
    22671        self.m = <ghmm_dmodel*> safe_malloc(sizeof(ghmm_dmodel)) 
     
    23479        self.m.label_alphabet = NULL; self.m.alphabet = NULL 
    23580 
    23681        # Set number of states and number of outputs 
    237         self.m.N = A.nrows() 
     82        self.m.N = self.A.nrows() 
    23883        self.m.M = len(self._emission_symbols) 
    23984        # Set the model type to discrete 
    24085        self.m.model_type = GHMM_kDiscreteHMM 
    24186 
    24287        # Set that no a prior model probabilities are set. 
    24388        self.m.prior = -1 
     89 
    24490        # Assign model identifier if specified 
    24591        if name is not None:  
    24692            name = str(name) 
     
    256102        tmp_order     = [] 
    257103         
    258104        for i in range(self.m.N): 
    259             v = B[i] 
     105            v = self.B[i] 
    260106 
    261107            # Get a reference to the i-th state for convenience of the notation below. 
    262108            state = &(states[i]) 
     
    274120                tmp_order.append(order) 
    275121            else: 
    276122                raise ValueError, "number of columns (= %s) of B must be a power of the number of emission symbols (= %s)"%( 
    277                     B.ncols(), len(emission_symbols)) 
     123                    self.B.ncols(), len(emission_symbols)) 
    278124 
    279125            state.b = to_double_array(v) 
    280             state.pi = pi[i] 
     126            state.pi = self.pi[i] 
    281127 
    282128            silent_states.append( 1 if sum(v) == 0 else 0) 
    283129 
    284130            # Set out probabilities, i.e., the probabilities that each 
    285131            # symbol will be emitted from this state.  
    286             v = A[i] 
     132            v = self.A[i] 
    287133            nz = v.nonzero_positions() 
    288134            k = len(nz) 
    289135            state.out_states = k 
     
    294140                state.out_a[j]  = v[nz[j]] 
    295141 
    296142            # Set "in" probabilities 
    297             v = A.column(i) 
     143            v = self.A.column(i) 
    298144            nz = v.nonzero_positions() 
    299145            k = len(nz) 
    300146            state.in_states = k 
     
    307153            state.fix = 0 
    308154                 
    309155        self.m.s = states 
    310         self.initialized = True; return 
     156        self.initialized = True 
    311157 
    312         if sum(silent_states) > 0: 
    313             self.m.model_type |= GHMM_kSilentStates 
    314             self.m.silent = to_int_array(silent_states) 
    315  
    316         self.m.maxorder = max(tmp_order) 
    317         if self.m.maxorder > 0: 
    318             self.m.model_type |= GHMM_kHigherOrderEmissions 
    319             self.m.order = to_int_array(tmp_order) 
    320  
    321         # Initialize lookup table for powers of the alphabet size, 
    322         # which speeds up models with higher order states. 
    323         powLookUp = [1] * (self.m.maxorder+2) 
    324         for i in range(1,len(powLookUp)): 
    325             powLookUp[i] = powLookUp[i-1] * self.m.M 
    326         self.m.pow_lookup = to_int_array(powLookUp) 
    327  
    328         self.initialized = True 
     158##         if sum(silent_states) > 0: 
     159##             self.m.model_type |= GHMM_kSilentStates 
     160##             self.m.silent = to_int_array(silent_states) 
     161##         self.m.maxorder = max(tmp_order) 
     162##         if self.m.maxorder > 0: 
     163##             self.m.model_type |= GHMM_kHigherOrderEmissions 
     164##             self.m.order = to_int_array(tmp_order) 
     165##         # Initialize lookup table for powers of the alphabet size, 
     166##         # which speeds up models with higher order states. 
     167##         powLookUp = [1] * (self.m.maxorder+2) 
     168##         for i in range(1,len(powLookUp)): 
     169##             powLookUp[i] = powLookUp[i-1] * self.m.M 
     170##         self.m.pow_lookup = to_int_array(powLookUp) 
     171##         self.initialized = True 
    329172         
    330173    def __dealloc__(self): 
    331174        if self.initialized: 
     
    341184        EXAMPLES: 
    342185            sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[0.9,0.1],[0.1,0.9]], [0.5,0.5], [3/4, 'abc']) 
    343186            sage: a.__repr__() 
    344             "Discrete Hidden Markov Model (2 states, 2 outputs)\n\nInitial probabilities: [0.5, 0.5]\nTransition matrix:\n[0.1 0.9]\n[0.1 0.9]\nEmission matrix:\n[0.9 0.1]\n[0.1 0.9]\nEmission symbols: [3/4, 'abc']" 
     187            "Discrete Hidden Markov Model (2 states, 2 outputs)\nInitial probabilities: [0.5, 0.5]\nTransition matrix:\n[0.1 0.9]\n[0.1 0.9]\nEmission matrix:\n[0.9 0.1]\n[0.1 0.9]\nEmission symbols: [3/4, 'abc']" 
    345188        """ 
    346         s = "Discrete Hidden Markov Model%s (%s states, %s outputs)\n"%( 
     189        s = "Discrete Hidden Markov Model%s (%s states, %s outputs)"%( 
    347190            ' ' + self.m.name if self.m.name else '', 
    348191            self.m.N, self.m.M) 
    349192        s += '\nInitial probabilities: %s'%self.initial_probabilities() 
     
    439282        cdef ghmm_dseq *d = ghmm_dmodel_generate_sequences(self.m, seed, length, 1, length) 
    440283        cdef Py_ssize_t i 
    441284        v = [d.seq[0][i] for i in range(length)] 
     285        ghmm_dseq_free(&d) 
    442286        if self._emission_symbols_dict: 
    443287            w = self._emission_symbols 
    444288            return [w[i] for i in v] 
     
    510354    #################################################################### 
    511355    def log_likelihood(self, seq): 
    512356        r""" 
    513         HMM Problem 1: Return $\log ( P[seq | model] )$, the log of 
    514         the probability of seeing the given sequence given this model, 
    515         using the forward algorithm and assuming independance of the 
    516         sequence seq. 
     357        HMM Problem 1: Likelihood. Return $\log ( P[seq | model] )$, 
     358        the log of the probability of seeing the given sequence given 
     359        this model, using the forward algorithm and assuming 
     360        independance of the sequence seq. 
    517361 
    518362        INPUT: 
    519363            seq -- a list; sequence of observed emissions of the HMM 
     
    566410    ####################################################################     
    567411    def viterbi(self, seq): 
    568412        """ 
    569         HMM Problem 2: Determine a hidden sequence of states that is 
    570         most likely to produce the given sequence seq of obserations. 
     413        HMM Problem 2: Decoding.  Determine a hidden sequence of 
     414        states that is most likely to produce the given sequence seq 
     415        of obserations. 
    571416 
    572417        INPUT: 
    573418            seq -- sequence of emitted symbols 
     
    612457    # probability of observing O. 
    613458    #################################################################### 
    614459    def baum_welch(self, training_seqs, nsteps=None, log_likelihood_cutoff=None): 
    615         pass 
    616 ## /** Baum-Welch-Algorithm for parameter reestimation (training) in 
    617 ##     a discrete (discrete output functions) HMM. Scaled version 
    618 ##     for multiple sequences, alpha and beta matrices are allocated with 
    619 ##      ighmm_cmatrix_stat_alloc  
    620 ##     New parameters set directly in hmm (no storage of previous values!). 
    621 ##     For reference see:   
    622 ##     Rabiner, L.R.: "`A Tutorial on Hidden {Markov} Models and Selected 
    623 ##                 Applications in Speech Recognition"', Proceedings of the IEEE, 
    624 ##      77, no 2, 1989, pp 257--285     
    625 ##   @return            0/-1 success/error 
    626 ##   @param mo          initial model 
    627 ##   @param sq          training sequences 
    628 ##   */ 
    629 ## /** Just like reestimate_baum_welch, but you can limit 
    630 ##     the maximum number of steps 
    631 ##   @return            0/-1 success/error 
    632 ##   @param mo          initial model 
    633 ##   @param sq          training sequences 
    634 ##   @param max_step    maximal number of Baum-Welch steps 
    635 ##   @param likelihood_delta minimal improvement in likelihood required for carrying on. Relative value 
    636 ##   to log likelihood 
    637 ##   */ 
     460        """ 
     461        HMM Problem 3: Learning.  Given an observation sequence O and 
     462        the set of states in the HMM, improve the HMM using the 
     463        Baum-Welch algorithm to increase the probability of observing O. 
    638464 
     465        INPUT: 
     466            training_seqs -- a list of lists of emission symbols 
     467            nsteps -- integer or None (default: None) maximum number 
     468                      of Baum-Welch steps to take 
     469            log_likehood_cutoff -- positive float or None (default: 
     470                      None); the minimal improvement in likelihood 
     471                      with respect to the last iteration required to 
     472                      continue. Relative value to log likelihood 
     473 
     474        OUTPUT: 
     475            changes the model in places, or raises a RuntimError 
     476            exception on error 
     477 
     478        EXAMPLES: 
     479        We make a model that has two states and is equally likely to output 
     480        0 or 1 in either state and transitions back and forth with equal 
     481        probability. 
     482            sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.5,0.5]], [[0.5,0.5],[0.5,0.5]], [0.5,0.5]) 
     483 
     484        We give the model some training data this much more likely to 
     485        be 1 than 0. 
     486            sage: a.baum_welch([[1,1,1,1,0,1], [1,0,1,1,1,1]]) 
     487 
     488        Now the model's emission matrix changes since it is much 
     489        more likely to emit 1 than 0.   
     490            sage: a 
     491            Discrete Hidden Markov Model (2 states, 2 outputs) 
     492            Initial probabilities: [0.5, 0.5] 
     493            Transition matrix: 
     494            [0.5 0.5] 
     495            [0.5 0.5] 
     496            Emission matrix: 
     497            [0.166666666667 0.833333333333] 
     498            [0.166666666667 0.833333333333] 
     499 
     500        Note that 1/6 = 1.666...: 
     501            sage: 1.0/6 
     502            0.166666666666667 
     503 
     504        TESTS: 
     505        We test training with non-default string symbols: 
     506            sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.5,0.5]], [[0.5,0.5],[0.5,0.5]], [0.5,0.5], ['up','down']) 
     507            sage: a.baum_welch([['up','up'], ['down','up']]) 
     508            sage: a 
     509            Discrete Hidden Markov Model (2 states, 2 outputs) 
     510            Initial probabilities: [0.5, 0.5] 
     511            Transition matrix: 
     512            [0.5 0.5] 
     513            [0.5 0.5] 
     514            Emission matrix: 
     515            [0.75 0.25] 
     516            [0.75 0.25] 
     517            Emission symbols: ['up', 'down']         
     518 
     519        NOTE: Training for models including silent states is not yet supported. 
     520 
     521        REFERENCES: 
     522            Rabiner, L.R.: "`A Tutorial on Hidden Markov Models and Selected 
     523            Applications in Speech Recognition"', Proceedings of the IEEE, 
     524            77, no 2, 1989, pp 257--285.  
     525        """ 
     526        if self._emission_symbols_dict: 
     527            seqs = [[self._emission_symbols_dict[z] for z in x] for x in training_seqs] 
     528        else: 
     529            seqs = training_seqs 
     530             
     531        cdef ghmm_dseq* d = malloc_ghmm_dseq(seqs) 
    639532         
     533        if ghmm_dmodel_baum_welch(self.m, d): 
     534            raise RuntimeError, "error running Baum-Welch algorithm" 
    640535         
     536        ghmm_dseq_free(&d) 
    641537 
    642      
    643      
    644  
     538             
    645539################################################################################## 
    646540# Helper Functions 
    647541################################################################################## 
    648542 
    649 cdef double* to_double_array(v) except NULL: 
    650     cdef double x 
    651     cdef double* w = <double*> safe_malloc(sizeof(double)*len(v)) 
    652     cdef Py_ssize_t i = 0 
    653     for x in v: 
    654         w[i] = x 
    655         i += 1 
    656     return w 
    657  
    658 cdef int* to_int_array(v) except NULL: 
    659     cdef int x 
    660     cdef int* w = <int*> safe_malloc(sizeof(int)*len(v)) 
    661     cdef Py_ssize_t i = 0 
    662     for x in v: 
    663         w[i] = x 
    664         i += 1 
    665     return w 
    666  
    667 cdef void* safe_malloc(int bytes) except NULL: 
    668