Ticket #3726: sage-3726-part4.patch
| File sage-3726-part4.patch, 41.8 kB (added by was, 5 months ago) |
|---|
-
a/sage/stats/hmm/all.py
old new 1 from hmm import DiscreteHiddenMarkovModel 1 from hmm import DiscreteHiddenMarkovModel 2 from chmm import NormalHiddenMarkovModel -
/dev/null
old new 1 2 from hmm cimport HiddenMarkovModel 3 4 cdef extern from "ghmm/ghmm.h": 5 cdef int GHMM_kContinuousHMM 6 7 cdef extern from "ghmm/smodel.h": 8 ############################################################# 9 # Continuous emission density types 10 ############################################################# 11 cdef enum ghmm_density_t: 12 normal, 13 normal_right, # right tail 14 normal_approx, 15 normal_left, # left tail 16 uniform, 17 binormal, 18 multinormal, 19 density_number 20 21 ############################################################# 22 # Continuous emission 23 ############################################################# 24 # Mean and variance types 25 cdef union mean_t: 26 double val 27 double *vec 28 29 cdef union variance_t: 30 double val 31 double *mat 32 33 cdef struct ghmm_c_emission: 34 # Flag for density function for each component of the mixture 35 # 0: normal density 36 # 1: truncated normal (right side) density 37 # 2: approximated normal density 38 # 3: truncated normal (left side) 39 # 4: uniform distribution 40 # 6: multivariate normal 41 int type 42 # dimension > 1 for multivariate normals 43 int dimension 44 # mean for output functions (pointer to mean vector for multivariate) 45 mean_t mean 46 variance_t variance 47 # pointer to inverse of covariance matrix if multivariate normal; 48 # else NULL 49 double *sigmainv 50 # determinant of covariance matrix for multivariate normal 51 double det 52 # Cholesky decomposition of covariance matrix A, 53 # if A = G*G' sigmacd only holds G 54 double *sigmacd 55 # minimum of uniform distribution or left boundary for 56 # right-tail gaussians 57 double min 58 # maximum of uniform distribution or right boundary for 59 # left-tail gaussians 60 double max 61 # if fixed != 0 the parameters of the density are fixed 62 int fixed 63 64 ############################################################# 65 # Continous Emission States 66 ############################################################# 67 ctypedef struct ghmm_cstate: 68 # Number of output densities per state 69 int M 70 # initial prob. 71 double pi 72 # IDs of successor states 73 int *out_id 74 # IDs of predecessor states 75 int *in_id 76 # transition probs to successor states. It is a 77 # matrix in case of mult. transition matrices (COS > 1) 78 double **out_a 79 # transition probs from predecessor states. It is a 80 # matrix in case of mult. transition matrices (COS > 1) 81 double **in_a 82 # number of successor states 83 int out_states 84 # number of predecessor states 85 int in_states 86 # weight vector for output function components 87 double *c 88 # flag for fixation of parameter. If fix = 1 do not change parameters of 89 # output functions, and if fix = 0 do normal training. Default is 0. 90 int fix 91 # vector of ghmm_c_emission 92 # (type and parameters of output function components) 93 ghmm_c_emission *e 94 # contains a description of the state (null terminated utf-8) 95 char *desc 96 # x coordinate position for graph representation plotting 97 int xPosition 98 # y coordinate position for graph representation plotting 99 int yPosition 100 101 ############################################################# 102 # Continous Hidden Markov Model 103 ############################################################# 104 ctypedef struct ghmm_cmodel 105 106 ctypedef struct ghmm_cmodel_class_change_context: 107 # Names of class change module/function (for python callback) 108 char *python_module 109 char *python_function 110 # index of current sequence 111 int k 112 # pointer to class function 113 int (*get_class) (ghmm_cmodel*, double*, int, int) 114 # space for any data necessary for class switch, USER is RESPONSIBLE 115 void *user_data 116 117 ctypedef struct ghmm_cmodel: 118 # Number of states 119 int N 120 # Maximun number of components in the states 121 int M 122 # Number of dimensions of the emission components. 123 # NOTE: All emissions must have the same number of dimensions. 124 int dim 125 # The ghmm_cmodel includes two continuous models 126 # cos=1: one transition matrix 127 # cos>1: (integer) extension for models with several matrices. 128 int cos 129 # Prior for a priori probability of the model. 130 # -1 means no prior specified (all models have equal prob. a priori.) 131 double prior 132 # Contains a arbitrary name for the model (null terminated utf-8) 133 char * name 134 # Contains bit flags for various model extensions such as 135 # kSilentStates (see ghmm.h for a complete list). 136 int model_type 137 # All states of the model. Transition probabilities are part of the states. 138 ghmm_cstate *s 139 # pointer to a ghmm_cmodel_class_change_context struct 140 # necessary for multiple transition classes 141 ghmm_cmodel_class_change_context *class_change 142 143 int ghmm_cmodel_class_change_alloc(ghmm_cmodel * smo) 144 145 ############################################################# 146 # Continous Sequence of states 147 # 148 # Sequence structure for double sequences. Contains an array of 149 # sequences and corresponding data like sequence labels, sequence 150 # weights, etc. Sequences may have different length. 151 # multi-dimension sequences are linearized 152 ############################################################# 153 ctypedef struct ghmm_cseq: 154 # sequence array. sequence[i][j] = j-th symbol of i-th seq. 155 # sequence[i][D * j] = first dimension of j-th observation of i-th sequence 156 double **seq 157 # array of sequence length 158 int *seq_len 159 # array of sequence IDs 160 double *seq_id 161 # positive! sequence weights. default is 1 = no weight 162 double *seq_w 163 # total number of sequences 164 long seq_number 165 # reserved space for sequences is always >= seq_number 166 long capacity 167 # sum of sequence weights 168 double total_w 169 # total number of dimensions 170 int dim 171 # flags (internal) 172 unsigned int flags 173 174 175 ############################################################# 176 # Continous Baum-Welch algorithm context 177 ############################################################# 178 ctypedef struct ghmm_cmodel_baum_welch_context: 179 # pointer of continuous model 180 ghmm_cmodel *smo 181 # sequence pointer 182 ghmm_cseq *sqd 183 # calculated log likelihood 184 double *logp 185 # leave reestimation loop if diff. between successive logp values 186 # is smaller than eps 187 double eps 188 # max. no of iterations 189 int max_iter 190 191 192 int ghmm_cmodel_free (ghmm_cmodel ** smo) 193 194 195 cdef class ContinuousHiddenMarkovModel(HiddenMarkovModel): 196 cdef ghmm_cmodel* m 197 cdef bint initialized 198 199 cdef class NormalHiddenMarkovModel(ContinuousHiddenMarkovModel): 200 pass 201 202 -
/dev/null
old new 1 2 3 include "../../ext/stdsage.pxi" 4 5 include "misc.pxi" 6 7 cdef class ContinuousHiddenMarkovModel(HiddenMarkovModel): 8 def __init__(self, A, B, pi, name=None): 9 self.initialized = False 10 HiddenMarkovModel.__init__(self, A, B, pi) 11 self.m = <ghmm_cmodel*> safe_malloc(sizeof(ghmm_cmodel)) 12 # Set number of states 13 self.m.N = self.A.nrows() 14 15 cdef class NormalHiddenMarkovModel(ContinuousHiddenMarkovModel): 16 """ 17 EXAMPLES: 18 The transition matrix: 19 sage: A = [[0,1,0],[0.5,0,0.5],[0.3,0.3,0.4]] 20 21 Parameters of the normal emission distributions in pairs of (mu, sigma): 22 sage: B = [(0,1), (-1,0.5), (1,0.2)] 23 24 The initial probabilities per state: 25 sage: pi = [1,0,0] 26 27 Create the continuous HMM: 28 sage: m = hmm.NormalHiddenMarkovModel(A, B, pi); m 29 """ 30 def __init__(self, A, B, pi, name=None): 31 ContinuousHiddenMarkovModel.__init__(self, A, B, pi) 32 33 # Set number of outputs 34 self.m.M = 1 35 36 # Set the model type to continuous 37 self.m.model_type = GHMM_kContinuousHMM 38 39 # Assign model identifier if specified 40 if name is not None: 41 name = str(name) 42 self.m.name = name 43 else: 44 self.m.name = NULL 45 46 # 1 transition matrix 47 self.m.cos = 1 48 # Set that no a prior model probabilities are set. 49 self.m.prior = -1 50 # Dimension is 1 51 self.m.dim = 1 52 53 # Allocate and initialize states 54 cdef ghmm_cstate* states = <ghmm_cstate*> safe_malloc(sizeof(ghmm_cstate) * self.m.N) 55 cdef ghmm_cstate* state 56 cdef ghmm_c_emission* e 57 58 for i in range(self.m.N): 59 # Parameters of normal distribution 60 mu, sigma = self.B[i] 61 # Get a reference to the i-th state for convenience of the notation below. 62 state = &(states[i]) 63 state.desc = NULL 64 state.M = 1 65 e = <ghmm_c_emission*> safe_malloc(sizeof(ghmm_c_emission)) 66 e.type = 0 # normal 67 e.dimension = 1 68 e.mean.val = mu 69 e.variance.val = sigma 70 # fixing of emissions is deactivated by default 71 e.fixed = 0 72 e.sigmacd = NULL 73 e.sigmainv = NULL 74 state.e = e 75 state.c = to_double_array([1.0]) 76 77 # Set states 78 self.m.s = states 79 80 self.initialized = True 81 82 def __dealloc__(self): 83 return 84 if self.initialized: 85 ghmm_cmodel_free(&self.m) 86 87 def __repr__(self): 88 """ 89 Return string representation of this Continuous HMM. 90 91 OUTPUT: 92 string 93 94 EXAMPLES: 95 sage: m = hmm.NormalHiddenMarkovModel([[0.0,1.0,0],[0.5,0.0,0.5],[0.3,0.3,0.4]], [(0.0,1.0), (-1.0,0.5), (1.0,0.2)], [1,0,0]) 96 sage: a.__repr__() 97 "Discrete Hidden Markov Model (2 states, 2 outputs)\nInitial probabilities: [0.5, 0.5]\nTransition matrix:\n[0.1 0.9]\n[0.1 0.9]\nEmission matrix:\n[0.9 0.1]\n[0.1 0.9]\nEmission symbols: [3/4, 'abc']" 98 """ 99 s = "Normal Hidden Markov Model%s (%s states, %s outputs)"%( 100 ' ' + self.m.name if self.m.name else '', 101 self.m.N, self.m.M) 102 #s += '\nInitial probabilities: %s'%self.initial_probabilities() 103 #s += '\nTransition matrix:\n%s'%self.transition_matrix() 104 #s += '\nEmission matrix:\n%s'%self.emission_matrix() 105 return s 106 -
/dev/null
old new 1 from sage.matrix.matrix_real_double_dense cimport Matrix_real_double_dense 2 3 cdef extern from "ghmm/ghmm.h": 4 cdef int GHMM_kSilentStates 5 cdef int GHMM_kHigherOrderEmissions 6 cdef int GHMM_kDiscreteHMM 7 8 cdef extern from "ghmm/model.h": 9 ctypedef struct ghmm_dstate: 10 # Initial probability 11 double pi 12 # Output probability 13 double *b 14 15 # ID's of the following states 16 int *out_id 17 # ID's of the previous states 18 int *in_id 19 20 # Transition probabilities to successor states. 21 double *out_a 22 # Transition probabilities from predecessor states. 23 double *in_a 24 25 # Number of successor states 26 int out_states 27 # Number of precursor states 28 int in_states 29 # if fix == 1 then output probabilities b stay fixed during the training 30 int fix 31 # Contains a description of the state (null terminated utf-8) 32 char * desc 33 # x coordinate position for graph representation plotting 34 int xPosition 35 # y coordinate position for graph representation plotting 36 int yPosition 37 38 ctypedef struct ghmm_dbackground: 39 pass 40 41 ctypedef struct ghmm_alphabet: 42 pass 43 44 # Discrete HMM's. 45 ctypedef struct ghmm_dmodel: 46 # Number of states 47 int N 48 # Number of outputs 49 int M 50 # Vector of states 51 ghmm_dstate *s 52 # Contains bit flags for varios model extensions such as 53 # kSilentStates, kTiedEmissions (see ghmm.h for a complete list) 54 int model_type 55 # The a priori probability for the model. 56 # A value of -1 indicates that no prior is defined. 57 # Note: this is not to be confused with priors on emission distributions. 58 double prior 59 # An arbitrary name for the model (null terminated utf-8) 60 char * name 61 62 # Flag variables for each state indicating whether it is emitting or not. 63 # Note: silent != NULL iff (model_type & kSilentStates) == 1 64 int *silent 65 66 # Int variable for the maximum level of higher order emissions. 67 int maxorder 68 69 # saves the history of emissions as int, 70 # the nth-last emission is (emission_history * |alphabet|^n+1) % |alphabet| 71 int emission_history 72 73 # Flag variables for each state indicating whether the states emissions 74 # are tied to another state. Groups of tied states are represented 75 # by their tie group leader (the lowest numbered member of the group). 76 # tied_to[s] == kUntied : s is not a tied state 77 # tied_to[s] == s : s is a tie group leader 78 # tied_to[t] == s : t is tied to state s (t>s) 79 # Note: tied_to != NULL iff (model_type & kTiedEmissions) != 0 */ 80 int *tied_to 81 82 # Note: State store order information of the emissions. 83 # Classical HMMS have emission order 0; that is, the emission probability 84 # is conditioned only on the state emitting the symbol. 85 # For higher order emissions, the emission are conditioned on the state s 86 # as well as the previous emission_order observed symbols. 87 # The emissions are stored in the state's usual double* b. 88 # Note: order != NULL iff (model_type & kHigherOrderEmissions) != 0 */ 89 int * order 90 91 # ghmm_dbackground is a pointer to a 92 # ghmm_dbackground structure, which holds (essentially) an 93 # array of background distributions (which are just vectors of floating 94 # point numbers like state.b). 95 # For each state the array background_id indicates which of the background 96 # distributions to use in parameter estimation. A value of kNoBackgroundDistribution 97 # indicates that none should be used. 98 # Note: background_id != NULL iff (model_type & kHasBackgroundDistributions) != 0 99 int *background_id 100 ghmm_dbackground *bp 101 102 # Topological ordering of silent states 103 # Condition: topo_order != NULL iff (model_type & kSilentStates) != 0 104 int *topo_order 105 int topo_order_length 106 107 # pow_lookup is a array of precomputed powers 108 # It contains in the i-th entry M (alphabet size) to the power of i 109 # The last entry is maxorder+1. 110 int *pow_lookup 111 112 # Store for each state a class label. Limits the possibly state sequence 113 # Note: label != NULL iff (model_type & kLabeledStates) != 0 114 int* label 115 ghmm_alphabet* label_alphabet 116 ghmm_alphabet* alphabet 117 118 119 # Discrete sequences 120 ctypedef struct ghmm_dseq: 121 # sequence array. sequence[i] [j] = j-th symbol of i-th seq 122 int **seq 123 # matrix of state ids, can be used to save the viterbi path during sequence generation. 124 # ATTENTION: is NOT allocated by ghmm_dseq_calloc 125 int **states 126 # array of sequence length 127 int *seq_len 128 # array of state path lengths 129 int *states_len 130 # array of sequence IDs 131 double *seq_id 132 # positive sequence weights. default is 1 = no weight 133 double *seq_w 134 # total number of sequences 135 long seq_number 136 # reserved space for sequences is always >= seq_number 137 long capacity 138 # sum of sequence weights 139 double total_w 140 # matrix of state labels corresponding to seq 141 int **state_labels 142 # number of labels for each sequence 143 int *state_labels_len 144 # internal flags 145 unsigned int flags 146 147 ghmm_dseq *ghmm_dmodel_label_generate_sequences( 148 ghmm_dmodel * mo, int seed, int global_len, long seq_number, int Tmax) 149 ghmm_dseq *ghmm_dmodel_generate_sequences(ghmm_dmodel* mo, int seed, int global_len, 150 long seq_number, int Tmax) 151 int ghmm_dseq_free (ghmm_dseq ** sq) 152 ghmm_dseq *ghmm_dseq_calloc (long seq_number) 153 154 int ghmm_dmodel_normalize(ghmm_dmodel *m) 155 int ghmm_dmodel_free(ghmm_dmodel **m) 156 157 # Problem 1: Likelihood 158 int ghmm_dmodel_logp(ghmm_dmodel *m, int *O, int len, double *log_p) 159 160 # Problem 2: Decoding 161 int *ghmm_dmodel_viterbi (ghmm_dmodel *m, int *O, int len, int *pathlen, double *log_p) 162 163 # Problem 3: Learning 164 int ghmm_dmodel_baum_welch (ghmm_dmodel *m, ghmm_dseq *sq) 165 166 167 cdef class HiddenMarkovModel: 168 cdef Matrix_real_double_dense A, B 169 cdef list pi 170 171 cdef class DiscreteHiddenMarkovModel(HiddenMarkovModel): 172 cdef ghmm_dmodel* m 173 cdef bint initialized 174 cdef object _emission_symbols, _emission_symbols_dict -
a/sage/stats/hmm/hmm.pyx
old new 5 5 6 6 EXAMPLES: 7 7 8 TODO: 9 * make models pickleable (i.e., all parameters should be obtainable 10 using functions to make this easy). 11 * continuous hmm's 8 12 """ 9 13 10 14 import math … … 13 17 from sage.rings.all import RDF 14 18 from sage.misc.randstate import random 15 19 16 from sage.matrix.matrix_real_double_dense cimport Matrix_real_double_dense17 18 20 include "../../ext/stdsage.pxi" 19 21 20 cdef extern from "ghmm/ghmm.h": 21 cdef int GHMM_kSilentStates 22 cdef int GHMM_kHigherOrderEmissions 23 cdef int GHMM_kDiscreteHMM 24 25 cdef extern from "ghmm/model.h": 26 ctypedef struct ghmm_dstate: 27 # Initial probability 28 double pi 29 # Output probability 30 double *b 31 32 # ID's of the following states 33 int *out_id 34 # ID's of the previous states 35 int *in_id 36 37 # Transition probabilities to successor states. 38 double *out_a 39 # Transition probabilities from predecessor states. 40 double *in_a 41 42 # Number of successor states 43 int out_states 44 # Number of precursor states 45 int in_states 46 # if fix == 1 then output probabilities b stay fixed during the training 47 int fix 48 # Contains a description of the state (null terminated utf-8) 49 char * desc 50 # x coordinate position for graph representation plotting 51 int xPosition 52 # y coordinate position for graph representation plotting 53 int yPosition 54 55 ctypedef struct ghmm_dbackground: 56 pass 57 58 ctypedef struct ghmm_alphabet: 59 pass 60 61 # Discrete HMM's. 62 ctypedef struct ghmm_dmodel: 63 # Number of states 64 int N 65 # Number of outputs 66 int M 67 # Vector of states 68 ghmm_dstate *s 69 # Contains bit flags for varios model extensions such as 70 # kSilentStates, kTiedEmissions (see ghmm.h for a complete list) 71 int model_type 72 # The a priori probability for the model. 73 # A value of -1 indicates that no prior is defined. 74 # Note: this is not to be confused with priors on emission distributions. 75 double prior 76 # An arbitrary name for the model (null terminated utf-8) 77 char * name 78 79 # Flag variables for each state indicating whether it is emitting or not. 80 # Note: silent != NULL iff (model_type & kSilentStates) == 1 81 int *silent 82 83 # Int variable for the maximum level of higher order emissions. 84 int maxorder 85 86 # saves the history of emissions as int, 87 # the nth-last emission is (emission_history * |alphabet|^n+1) % |alphabet| 88 int emission_history 89 90 # Flag variables for each state indicating whether the states emissions 91 # are tied to another state. Groups of tied states are represented 92 # by their tie group leader (the lowest numbered member of the group). 93 # tied_to[s] == kUntied : s is not a tied state 94 # tied_to[s] == s : s is a tie group leader 95 # tied_to[t] == s : t is tied to state s (t>s) 96 # Note: tied_to != NULL iff (model_type & kTiedEmissions) != 0 */ 97 int *tied_to 98 99 # Note: State store order information of the emissions. 100 # Classical HMMS have emission order 0; that is, the emission probability 101 # is conditioned only on the state emitting the symbol. 102 # For higher order emissions, the emission are conditioned on the state s 103 # as well as the previous emission_order observed symbols. 104 # The emissions are stored in the state's usual double* b. 105 # Note: order != NULL iff (model_type & kHigherOrderEmissions) != 0 */ 106 int * order 107 108 # ghmm_dbackground is a pointer to a 109 # ghmm_dbackground structure, which holds (essentially) an 110 # array of background distributions (which are just vectors of floating 111 # point numbers like state.b). 112 # For each state the array background_id indicates which of the background 113 # distributions to use in parameter estimation. A value of kNoBackgroundDistribution 114 # indicates that none should be used. 115 # Note: background_id != NULL iff (model_type & kHasBackgroundDistributions) != 0 116 int *background_id 117 ghmm_dbackground *bp 118 119 # Topological ordering of silent states 120 # Condition: topo_order != NULL iff (model_type & kSilentStates) != 0 121 int *topo_order 122 int topo_order_length 123 124 # pow_lookup is a array of precomputed powers 125 # It contains in the i-th entry M (alphabet size) to the power of i 126 # The last entry is maxorder+1. 127 int *pow_lookup 128 129 # Store for each state a class label. Limits the possibly state sequence 130 # Note: label != NULL iff (model_type & kLabeledStates) != 0 131 int* label 132 ghmm_alphabet* label_alphabet 133 ghmm_alphabet* alphabet 134 135 136 # Discrete sequences 137 ctypedef struct ghmm_dseq: 138 # sequence array. sequence[i] [j] = j-th symbol of i-th seq 139 int **seq 140 # matrix of state ids, can be used to save the viterbi path during sequence generation. 141 # ATTENTION: is NOT allocated by ghmm_dseq_calloc 142 int **states 143 # array of sequence length 144 int *seq_len 145 # array of state path lengths 146 int *states_len 147 # array of sequence IDs 148 double *seq_id 149 # positive sequence weights. default is 1 = no weight 150 double *seq_w 151 # total number of sequences 152 long seq_number 153 # reserved space for sequences is always >= seq_number 154 long capacity 155 # sum of sequence weights 156 double total_w 157 # matrix of state labels corresponding to seq 158 int **state_labels 159 # number of labels for each sequence 160 int *state_labels_len 161 # internal flags 162 unsigned int flags 163 164 ghmm_dseq *ghmm_dmodel_label_generate_sequences( 165 ghmm_dmodel * mo, int seed, int global_len, long seq_number, int Tmax) 166 ghmm_dseq *ghmm_dmodel_generate_sequences(ghmm_dmodel* mo, int seed, int global_len, 167 long seq_number, int Tmax) 168 169 170 int ghmm_dmodel_normalize(ghmm_dmodel *m) 171 int ghmm_dmodel_free(ghmm_dmodel **m) 172 int ghmm_dmodel_logp(ghmm_dmodel *m, int *O, int len, double *log_p) 173 int *ghmm_dmodel_viterbi (ghmm_dmodel *m, int *O, int len, int *pathlen, double *log_p) 174 int ghmm_dmodel_baum_welch (ghmm_dmodel *m, ghmm_dseq *sq) 175 int ghmm_dmodel_baum_welch_nstep (ghmm_dmodel * m, ghmm_dseq *sq, int max_step, 176 double likelihood_delta) 177 22 include "misc.pxi" 178 23 179 24 cdef class HiddenMarkovModel: 180 pass 181 182 183 cdef class DiscreteHiddenMarkovModel: 184 cdef ghmm_dmodel* m 185 cdef bint initialized 186 cdef object _emission_symbols, _emission_symbols_dict 187 cdef Matrix_real_double_dense A, B 188 189 def __init__(self, A, B, pi, emission_symbols=None, name=None): 190 """ 191 INPUTS: 192 A -- square matrix of doubles; the state change probabilities 193 B -- matrix of doubles; emission probabilities 194 pi -- list of doubles; probabilities for each initial state 195 emission_state -- list of B.ncols() symbols (just used for printing) 196 name -- (optional) name of the model 197 198 EXAMPLES: 199 We create a discrete HMM with 2 internal states on an alphabet of 200 size 2. 201 202 sage: a = hmm.DiscreteHiddenMarkovModel([[0.2,0.8],[0.5,0.5]], [[1,0],[0,1]], [0,1]) 203 204 """ 25 def __init__(self, A, B, pi): 205 26 if not is_Matrix(A): 206 27 A = matrix(RDF, len(A), len(A[0]), A) 207 28 if not is_Matrix(B): … … 214 35 A = A.change_ring(RDF) 215 36 if B.base_ring() != RDF: 216 37 B = B.change_ring(RDF) 217 if len(pi) != A.nrows(): 38 39 self.pi = [float(x) for x in pi] 40 if len(self.pi) != A.nrows(): 218 41 raise ValueError, "length of pi must equal number of rows of A" 219 220 cdef Py_ssize_t i, j, k221 42 222 43 self.A = A 223 44 self.B = B 45 46 47 cdef class DiscreteHiddenMarkovModel(HiddenMarkovModel): 48 49 def __init__(self, A, B, pi, emission_symbols=None, name=None): 50 """ 51 INPUTS: 52 A -- square matrix of doubles; the state change probabilities 53 B -- matrix of doubles; emission probabilities 54 pi -- list of floats; probabilities for each initial state 55 emission_state -- list of B.ncols() symbols (just used for printing) 56 name -- (optional) name of the model 57 58 EXAMPLES: 59 We create a discrete HMM with 2 internal states on an alphabet of 60 size 2. 61 62 sage: a = hmm.DiscreteHiddenMarkovModel([[0.2,0.8],[0.5,0.5]], [[1,0],[0,1]], [0,1]) 63 64 """ 65 self.initialized = False 66 HiddenMarkovModel.__init__(self, A, B, pi) 67 68 cdef Py_ssize_t i, j, k 224 69 self.set_emission_symbols(emission_symbols) 225 70 226 71 self.m = <ghmm_dmodel*> safe_malloc(sizeof(ghmm_dmodel)) … … 234 79 self.m.label_alphabet = NULL; self.m.alphabet = NULL 235 80 236 81 # Set number of states and number of outputs 237 self.m.N = A.nrows()82 self.m.N = self.A.nrows() 238 83 self.m.M = len(self._emission_symbols) 239 84 # Set the model type to discrete 240 85 self.m.model_type = GHMM_kDiscreteHMM 241 86 242 87 # Set that no a prior model probabilities are set. 243 88 self.m.prior = -1 89 244 90 # Assign model identifier if specified 245 91 if name is not None: 246 92 name = str(name) … … 256 102 tmp_order = [] 257 103 258 104 for i in range(self.m.N): 259 v = B[i]105 v = self.B[i] 260 106 261 107 # Get a reference to the i-th state for convenience of the notation below. 262 108 state = &(states[i]) … … 274 120 tmp_order.append(order) 275 121 else: 276 122 raise ValueError, "number of columns (= %s) of B must be a power of the number of emission symbols (= %s)"%( 277 B.ncols(), len(emission_symbols))123 self.B.ncols(), len(emission_symbols)) 278 124 279 125 state.b = to_double_array(v) 280 state.pi = pi[i]126 state.pi = self.pi[i] 281 127 282 128 silent_states.append( 1 if sum(v) == 0 else 0) 283 129 284 130 # Set out probabilities, i.e., the probabilities that each 285 131 # symbol will be emitted from this state. 286 v = A[i]132 v = self.A[i] 287 133 nz = v.nonzero_positions() 288 134 k = len(nz) 289 135 state.out_states = k … … 294 140 state.out_a[j] = v[nz[j]] 295 141 296 142 # Set "in" probabilities 297 v = A.column(i)143 v = self.A.column(i) 298 144 nz = v.nonzero_positions() 299 145 k = len(nz) 300 146 state.in_states = k … … 307 153 state.fix = 0 308 154 309 155 self.m.s = states 310 self.initialized = True ; return156 self.initialized = True 311 157 312 if sum(silent_states) > 0: 313 self.m.model_type |= GHMM_kSilentStates 314 self.m.silent = to_int_array(silent_states) 315 316 self.m.maxorder = max(tmp_order) 317 if self.m.maxorder > 0: 318 self.m.model_type |= GHMM_kHigherOrderEmissions 319 self.m.order = to_int_array(tmp_order) 320 321 # Initialize lookup table for powers of the alphabet size, 322 # which speeds up models with higher order states. 323 powLookUp = [1] * (self.m.maxorder+2) 324 for i in range(1,len(powLookUp)): 325 powLookUp[i] = powLookUp[i-1] * self.m.M 326 self.m.pow_lookup = to_int_array(powLookUp) 327 328 self.initialized = True 158 ## if sum(silent_states) > 0: 159 ## self.m.model_type |= GHMM_kSilentStates 160 ## self.m.silent = to_int_array(silent_states) 161 ## self.m.maxorder = max(tmp_order) 162 ## if self.m.maxorder > 0: 163 ## self.m.model_type |= GHMM_kHigherOrderEmissions 164 ## self.m.order = to_int_array(tmp_order) 165 ## # Initialize lookup table for powers of the alphabet size, 166 ## # which speeds up models with higher order states. 167 ## powLookUp = [1] * (self.m.maxorder+2) 168 ## for i in range(1,len(powLookUp)): 169 ## powLookUp[i] = powLookUp[i-1] * self.m.M 170 ## self.m.pow_lookup = to_int_array(powLookUp) 171 ## self.initialized = True 329 172 330 173 def __dealloc__(self): 331 174 if self.initialized: … … 341 184 EXAMPLES: 342 185 sage: a = hmm.DiscreteHiddenMarkovModel([[0.1,0.9],[0.1,0.9]], [[0.9,0.1],[0.1,0.9]], [0.5,0.5], [3/4, 'abc']) 343 186 sage: a.__repr__() 344 "Discrete Hidden Markov Model (2 states, 2 outputs)\n \nInitial probabilities: [0.5, 0.5]\nTransition matrix:\n[0.1 0.9]\n[0.1 0.9]\nEmission matrix:\n[0.9 0.1]\n[0.1 0.9]\nEmission symbols: [3/4, 'abc']"187 "Discrete Hidden Markov Model (2 states, 2 outputs)\nInitial probabilities: [0.5, 0.5]\nTransition matrix:\n[0.1 0.9]\n[0.1 0.9]\nEmission matrix:\n[0.9 0.1]\n[0.1 0.9]\nEmission symbols: [3/4, 'abc']" 345 188 """ 346 s = "Discrete Hidden Markov Model%s (%s states, %s outputs) \n"%(189 s = "Discrete Hidden Markov Model%s (%s states, %s outputs)"%( 347 190 ' ' + self.m.name if self.m.name else '', 348 191 self.m.N, self.m.M) 349 192 s += '\nInitial probabilities: %s'%self.initial_probabilities() … … 439 282 cdef ghmm_dseq *d = ghmm_dmodel_generate_sequences(self.m, seed, length, 1, length) 440 283 cdef Py_ssize_t i 441 284 v = [d.seq[0][i] for i in range(length)] 285 ghmm_dseq_free(&d) 442 286 if self._emission_symbols_dict: 443 287 w = self._emission_symbols 444 288 return [w[i] for i in v] … … 510 354 #################################################################### 511 355 def log_likelihood(self, seq): 512 356 r""" 513 HMM Problem 1: Return $\log ( P[seq | model] )$, the log of514 the probability of seeing the given sequence given this model,515 using the forward algorithm and assuming independance of the516 sequence seq.357 HMM Problem 1: Likelihood. Return $\log ( P[seq | model] )$, 358 the log of the probability of seeing the given sequence given 359 this model, using the forward algorithm and assuming 360 independance of the sequence seq. 517 361 518 362 INPUT: 519 363 seq -- a list; sequence of observed emissions of the HMM … … 566 410 #################################################################### 567 411 def viterbi(self, seq): 568 412 """ 569 HMM Problem 2: Determine a hidden sequence of states that is 570 most likely to produce the given sequence seq of obserations. 413 HMM Problem 2: Decoding. Determine a hidden sequence of 414 states that is most likely to produce the given sequence seq 415 of obserations. 571 416 572 417 INPUT: 573 418 seq -- sequence of emitted symbols … … 612 457 # probability of observing O. 613 458 #################################################################### 614 459 def baum_welch(self, training_seqs, nsteps=None, log_likelihood_cutoff=None): 615 pass 616 ## /** Baum-Welch-Algorithm for parameter reestimation (training) in 617 ## a discrete (discrete output functions) HMM. Scaled version 618 ## for multiple sequences, alpha and beta matrices are allocated with 619 ## ighmm_cmatrix_stat_alloc 620 ## New parameters set directly in hmm (no storage of previous values!). 621 ## For reference see: 622 ## Rabiner, L.R.: "`A Tutorial on Hidden {Markov} Models and Selected 623 ## Applications in Speech Recognition"', Proceedings of the IEEE, 624 ## 77, no 2, 1989, pp 257--285 625 ## @return 0/-1 success/error 626 ## @param mo initial model 627 ## @param sq training sequences 628 ## */ 629 ## /** Just like reestimate_baum_welch, but you can limit 630 ## the maximum number of steps 631 ## @return 0/-1 success/error 632 ## @param mo initial model 633 ## @param sq training sequences 634 ## @param max_step maximal number of Baum-Welch steps 635 ## @param likelihood_delta minimal improvement in likelihood required for carrying on. Relative value 636 ## to log likelihood 637 ## */ 460 """ 461 HMM Problem 3: Learning. Given an observation sequence O and 462 the set of states in the HMM, improve the HMM using the 463 Baum-Welch algorithm to increase the probability of observing O. 638 464 465 INPUT: 466 training_seqs -- a list of lists of emission symbols 467 nsteps -- integer or None (default: None) maximum number 468 of Baum-Welch steps to take 469 log_likehood_cutoff -- positive float or None (default: 470 None); the minimal improvement in likelihood 471 with respect to the last iteration required to 472 continue. Relative value to log likelihood 473 474 OUTPUT: 475 changes the model in places, or raises a RuntimError 476 exception on error 477 478 EXAMPLES: 479 We make a model that has two states and is equally likely to output 480 0 or 1 in either state and transitions back and forth with equal 481 probability. 482 sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.5,0.5]], [[0.5,0.5],[0.5,0.5]], [0.5,0.5]) 483 484 We give the model some training data this much more likely to 485 be 1 than 0. 486 sage: a.baum_welch([[1,1,1,1,0,1], [1,0,1,1,1,1]]) 487 488 Now the model's emission matrix changes since it is much 489 more likely to emit 1 than 0. 490 sage: a 491 Discrete Hidden Markov Model (2 states, 2 outputs) 492 Initial probabilities: [0.5, 0.5] 493 Transition matrix: 494 [0.5 0.5] 495 [0.5 0.5] 496 Emission matrix: 497 [0.166666666667 0.833333333333] 498 [0.166666666667 0.833333333333] 499 500 Note that 1/6 = 1.666...: 501 sage: 1.0/6 502 0.166666666666667 503 504 TESTS: 505 We test training with non-default string symbols: 506 sage: a = hmm.DiscreteHiddenMarkovModel([[0.5,0.5],[0.5,0.5]], [[0.5,0.5],[0.5,0.5]], [0.5,0.5], ['up','down']) 507 sage: a.baum_welch([['up','up'], ['down','up']]) 508 sage: a 509 Discrete Hidden Markov Model (2 states, 2 outputs) 510 Initial probabilities: [0.5, 0.5] 511 Transition matrix: 512 [0.5 0.5] 513 [0.5 0.5] 514 Emission matrix: 515 [0.75 0.25] 516 [0.75 0.25] 517 Emission symbols: ['up', 'down'] 518 519 NOTE: Training for models including silent states is not yet supported. 520 521 REFERENCES: 522 Rabiner, L.R.: "`A Tutorial on Hidden Markov Models and Selected 523 Applications in Speech Recognition"', Proceedings of the IEEE, 524 77, no 2, 1989, pp 257--285. 525 """ 526 if self._emission_symbols_dict: 527 seqs = [[self._emission_symbols_dict[z] for z in x] for x in training_seqs] 528 else: 529 seqs = training_seqs 530 531 cdef ghmm_dseq* d = malloc_ghmm_dseq(seqs) 639 532 533 if ghmm_dmodel_baum_welch(self.m, d): 534 raise RuntimeError, "error running Baum-Welch algorithm" 640 535 536 ghmm_dseq_free(&d) 641 537 642 643 644 538 645 539 ################################################################################## 646 540 # Helper Functions 647 541 ################################################################################## 648 542 649 cdef double* to_double_array(v) except NULL: 650 cdef double x 651 cdef double* w = <double*> safe_malloc(sizeof(double)*len(v)) 652 cdef Py_ssize_t i = 0 653 for x in v: 654 w[i] = x 655 i += 1 656 return w 657 658 cdef int* to_int_array(v) except NULL: 659 cdef int x 660 cdef int* w = <int*> safe_malloc(sizeof(int)*len(v)) 661 cdef Py_ssize_t i = 0 662 for x in v: 663 w[i] = x 664 i += 1 665 return w 666 667 cdef void* safe_malloc(int bytes) except NULL: 668