Edinburgh Speech Tools  2.1-release
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
EST_SCFG.h
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : October 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* Stochastic context free grammars */
38 /* */
39 /*=======================================================================*/
40 #ifndef __EST_SCFG_H__
41 #define __EST_SCFG_H__
42 
43 #include "EST_simplestats.h"
44 #include "EST_rw_status.h"
45 #include "EST_TList.h"
46 #include "siod.h"
47 
48 /** \class EST_bracketed_string
49  \brief This class represents a bracketed string used in training of SCFGs.
50 
51  An object in this class builds an index of valid bracketing of
52  the string, thus offering both a tree like access and direct
53  access to the leafs of the tree. The definition of ``valid
54  bracketing'' is any substring \f[ W_{i,j} \f] that doesn't cross any
55  brackets.
56 */
58  private:
59  int p_length;
60  LISP *symbols;
61  LISP bs;
62  int **valid_spans; // triangular matrix
63  int find_num_nodes(LISP string);
64  int set_leaf_indices(LISP string,int i,LISP *symbols);
65  int num_leafs(LISP l) const;
66  void find_valid(int i,LISP t) const;
67  void init();
68  public:
69  ///
71  ///
72  EST_bracketed_string(LISP string);
73  ///
75 
76  ///
77  void set_bracketed_string(LISP string);
78  ///
79  int length() const {return p_length;}
80  ///
81  LISP string() const { return bs; }
82  /// The nth symbol in the string.
83  const EST_String symbol_at(int i) const
84  { return EST_String(get_c_string(car(symbols[i]))); }
85  /// If a bracketing from i to k is valid in string
86  int valid(int i,int k) const { return valid_spans[i][k]; }
87 
88  ///
89  int operator !=(const EST_bracketed_string &a) const
90  { return (!(this == &a)); }
91  ///
92  friend ostream& operator << (ostream &s, const EST_bracketed_string &a)
93  { (void)a; s << "[a bracketed string]" << endl; return s; }
94 
95 };
96 
98 
99 // Only support Chomsky Normal Form at present
100 enum est_scfg_rtype {est_scfg_unset, est_scfg_binary_rule,
101  est_scfg_unary_rule};
102 
103 /** \class EST_SCFG_Rule
104  \brief A stochastic context free grammar rule.
105 
106  At present only two types of rule are supported:
107  `est\_scfg\_binary\_rule` and `est\_scfg\_unary\_rule`.
108  This is sufficient for the representation of grammars in
109  Chomsky Normal Form. Each rule also has a probability associated
110  with it. Terminals and noterminals are represented as ints using
111  the \ref EST_Discrete s in \ref EST_SCFG to reference the actual
112  alphabets.
113 
114  Although this class includes a "probability" nothing in the rule
115  itself enforces it to be a true probability. It is responsibility
116  of the classes that use this rule to enforce that condition if
117  desired.
118 
119  @author Alan W Black (awb@cstr.ed.ac.uk): October 1997
120 */
122  private:
123  int p_mother;
124  int p_daughter1;
125  int p_daughter2;
126  est_scfg_rtype p_type;
127  double p_prob;
128  public:
129  ///
130  EST_SCFG_Rule() {p_type=est_scfg_unset; p_prob=0;}
131  ///
132  EST_SCFG_Rule(const EST_SCFG_Rule &r)
133  {p_mother = r.p_mother; p_daughter1 = r.p_daughter1;
134  p_daughter2 = r.p_daughter2; p_type=r.p_type; p_prob = r.p_prob;}
135  /// Create a unary rule.
136  EST_SCFG_Rule(double prob,int p,int m);
137  /// Create a binary rule.
138  EST_SCFG_Rule(double prob,int p, int q, int r);
139  /// The rule's probability
140  double prob() const {return p_prob;}
141  /// set the probability
142  void set_prob(double p) { p_prob=p;}
143  /// rule type
144  est_scfg_rtype type() const { return p_type; }
145  ///
146  int mother() const {return p_mother;}
147  /** In a unary rule this is a terminal, in a binary rule it
148  is a nonterminal
149  */
150  int daughter1() const {return p_daughter1;}
151  ///
152  int daughter2() const {return p_daughter2;}
153  ///
154  void set_rule(double prob,int p, int m);
155  ///
156  void set_rule(double prob,int p, int q, int r);
157 };
158 
160 
161 /** \class EST_SCFG
162  \brief A class representing a stochastic context free grammar (SCFG).
163 
164  This class includes the representation of the grammar itself and
165  methods for training and testing it against some corpus.
166 
167  At presnet of grammars in Chomsky Normal Form are supported. That
168  is rules may be binary or unary. If binary the mother an two
169  daughters are nonterminals, if unary the mother must be nonterminal
170  and daughter a terminal symbol.
171 
172  The terminals and nonterminals symbol sets are derived automatically
173  from the LISP representation of the rules at initialization time
174  and are represented as \ref EST_Discrete. The distinguished
175  symbol is assumed to be the first mother of the first rule in
176  the given grammar.
177 
178 */
179 class EST_SCFG {
180  private:
181  EST_Discrete nonterminals;
182  EST_Discrete terminals;
183  int p_distinguished_symbol;
184  // Index of probabilities for binary rules in grammar
185  double ***p_prob_B;
186  // Index of probabilities for unary rules in grammar
187  double **p_prob_U;
188  // Build rule probability caches
189  void rule_prob_cache();
190  // Delete rule probability caches
191  void delete_rule_prob_cache();
192  public:
193  /**@name Constructor and initialisation functions */
194  ///@{
195  EST_SCFG();
196  /// Initialize from a set of rules
197  EST_SCFG(LISP rules);
198  ~EST_SCFG();
199  ///@}
200 
201  /**@name utility functions */
202  ///@{
203  /// Set (or reset) rules from external source after construction
204  void set_rules(LISP rules);
205  /// Return rules as LISP list.
206  LISP get_rules();
207  /// The rules themselves
208  SCFGRuleList rules;
209  int distinguished_symbol() const { return p_distinguished_symbol; }
210  /** Find the terminals and nonterminals in the given grammar, adding
211  them to the appropriate given string lists.
212  */
213  void find_terms_nonterms(EST_StrList &nt, EST_StrList &t,LISP rules);
214  /// Convert nonterminal index to string form
215  EST_String nonterminal(int p) const { return nonterminals.name(p); }
216  /// Convert terminal index to string form
217  EST_String terminal(int m) const { return terminals.name(m); }
218  /// Convert nonterminal string to index
219  int nonterminal(const EST_String &p) const { return nonterminals.name(p); }
220  /// Convert terminal string to index
221  int terminal(const EST_String &m) const { return terminals.name(m); }
222  /// Number of nonterminals
223  int num_nonterminals() const { return nonterminals.length(); }
224  /// Number of terminals
225  int num_terminals() const { return terminals.length(); }
226  /// The rule probability of given binary rule
227  double prob_B(int p, int q, int r) const { return p_prob_B[p][q][r]; }
228  /// The rule probability of given unary rule
229  double prob_U(int p, int m) const { return p_prob_U[p][m]; }
230  /// (re-)set rule probability caches
231  void set_rule_prob_cache();
232  ///@}
233 
234  /**@name file i/o functions */
235  ///@{
236  /// Load grammar from named file
237  EST_read_status load(const EST_String &filename);
238  /// Save current grammar to named file
239  EST_write_status save(const EST_String &filename);
240  ///@}
241 };
242 
243 /** \class EST_SCFG_traintest
244  \brief A class used to train (and test) SCFGs is an extension of
245  \ref EST_SCFG .
246 
247  This offers an implementation of Pereira and Schabes ``Inside-Outside
248  reestimation from partially bracket corpora.'' ACL 1992.
249 
250  A SCFG maybe trained from a corpus (optionally) containing brackets
251  over a series of passes reestimating the grammar probabilities
252  after each pass. This basically extends the \ref EST_SCFG class
253  adding support for a bracket corpus and various indexes for efficient
254  use of the grammar.
255 */
256 class EST_SCFG_traintest : public EST_SCFG {
257  private:
258  /// Index for inside probabilities
259  double ***inside;
260  /// Index for outside probabilities
261  double ***outside;
262  EST_Bcorpus corpus;
263  /// Partial (numerator) for reestimation
264  EST_DVector n;
265  /// Partial (denominator) for reestimation
266  EST_DVector d;
267 
268  /// Calculate inside probability.
269  double f_I_cal(int c, int p, int i, int k);
270  /// Lookup or calculate inside probability.
271  double f_I(int c, int p, int i, int k)
272  { double r;
273  if ((r=inside[p][i][k]) != -1) return r;
274  else return f_I_cal(c,p,i,k); }
275  /// Calculate outside probability.
276  double f_O_cal(int c, int p, int i, int k);
277  /// Lookup or calculate outside probability.
278  double f_O(int c, int p, int i, int k)
279  { double r;
280  if ((r=outside[p][i][k]) != -1) return r;
281  else return f_O_cal(c,p,i,k); }
282  /// Find probability of parse of corpus sentence `c`
283  double f_P(int c);
284  /** Find probability of parse of corpus sentence `c` for
285  nonterminal `p`
286  */
287  double f_P(int c,int p);
288  /// Re-estimate probability of binary rule using inside-outside algorithm
289  void reestimate_rule_prob_B(int c, int ri, int p, int q, int r);
290  /// Re-estimate probability of unary rule using inside-outside algorithm
291  void reestimate_rule_prob_U(int c, int ri, int p, int m);
292  /// Do grammar re-estimation
293  void reestimate_grammar_probs(int passes,
294  int startpass,
295  int checkpoint,
296  int spread,
297  const EST_String &outfile);
298  ///
299  double cross_entropy();
300  /// Initialize the cache for inside/outside values for sentence `c`
301  void init_io_cache(int c,int nt);
302  /// Clear the cache for inside/outside values for sentence `c`
303  void clear_io_cache(int c);
304  public:
307 
308  /** Test the current grammar against the current corpus print summary.
309 
310  Cross entropy measure only is given.
311  */
312  void test_corpus();
313  /** Test the current grammar against the current corpus.
314 
315  Summary includes percentage of cross bracketing accuracy
316  and percentage of fully correct parses.
317  */
318  void test_crossbrackets();
319 
320  /** Load a corpus from the given file.
321 
322  Each sentence in the corpus should be contained in parentheses.
323  Additional parenthesis may be used to denote phrasing within
324  a sentence. The corpus is read using the LISP reader so LISP
325  conventions shold apply, notable single quotes should appear
326  within double quotes.
327  */
328  void load_corpus(const EST_String &filename);
329 
330  /** Train a grammar using the loaded corpus.
331 
332  @param passes the number of training passes desired.
333  @param startpass from which pass to start from
334  @param checkpoint save the grammar every n passes
335  @param spread Percentage of corpus to use on each pass, this cycles through the corpus on each pass.
336  @param outfile Output file name
337  */
338  void train_inout(int passes,
339  int startpass,
340  int checkpoint,
341  int spread,
342  const EST_String &outfile);
343 };
344 
345 /** From a full parse, extract the string with bracketing only.
346 */
347 LISP scfg_bracketing_only(LISP parse);
348 /** Cummulate cross bracketing information between ref and test.
349  */
350 void count_bracket_crossing(const EST_bracketed_string &ref,
351  const EST_bracketed_string &test,
352  EST_SuffStats &vs);
353 
354 #endif
int valid(int i, int k) const
If a bracketing from i to k is valid in string.
Definition: EST_SCFG.h:86
EST_read_status load(const EST_String &filename)
Load grammar from named file.
Definition: EST_SCFG.cc:193
void set_prob(double p)
set the probability
Definition: EST_SCFG.h:142
void find_terms_nonterms(EST_StrList &nt, EST_StrList &t, LISP rules)
Definition: EST_SCFG.cc:91
void set_rule_prob_cache()
(re-)set rule probability caches
Definition: EST_SCFG.cc:256
EST_String nonterminal(int p) const
Convert nonterminal index to string form.
Definition: EST_SCFG.h:215
A class used to train (and test) SCFGs is an extension of EST_SCFG .
Definition: EST_SCFG.h:256
double prob_B(int p, int q, int r) const
The rule probability of given binary rule.
Definition: EST_SCFG.h:227
int terminal(const EST_String &m) const
Convert terminal string to index.
Definition: EST_SCFG.h:221
const EST_String & name(const int n) const
The name given the index.
int nonterminal(const EST_String &p) const
Convert nonterminal string to index.
Definition: EST_SCFG.h:219
A class representing a stochastic context free grammar (SCFG).
Definition: EST_SCFG.h:179
This class represents a bracketed string used in training of SCFGs.
Definition: EST_SCFG.h:57
A stochastic context free grammar rule.
Definition: EST_SCFG.h:121
int num_nonterminals() const
Number of nonterminals.
Definition: EST_SCFG.h:223
void set_rules(LISP rules)
Set (or reset) rules from external source after construction.
Definition: EST_SCFG.cc:120
EST_write_status save(const EST_String &filename)
Save current grammar to named file.
Definition: EST_SCFG.cc:204
est_scfg_rtype type() const
rule type
Definition: EST_SCFG.h:144
EST_String terminal(int m) const
Convert terminal index to string form.
Definition: EST_SCFG.h:217
double prob() const
The rule's probability.
Definition: EST_SCFG.h:140
int num_terminals() const
Number of terminals.
Definition: EST_SCFG.h:225
int daughter1() const
Definition: EST_SCFG.h:150
SCFGRuleList rules
The rules themselves.
Definition: EST_SCFG.h:208
A vector class for double precision floating point numbers. EST_DVector x should be used instead of f...
Definition: EST_DMatrix.h:118
LISP get_rules()
Return rules as LISP list.
Definition: EST_SCFG.cc:169
void load_corpus(const EST_String &filename)
const EST_String symbol_at(int i) const
The nth symbol in the string.
Definition: EST_SCFG.h:83
void train_inout(int passes, int startpass, int checkpoint, int spread, const EST_String &outfile)
double prob_U(int p, int m) const
The rule probability of given unary rule.
Definition: EST_SCFG.h:229
const int length(void) const
The number of members in the discrete.