49 #include "EST_Wagon.h"
50 #include "EST_cmd_line.h"
52 enum wn_strategy_type {wn_decision_list, wn_decision_tree};
54 static wn_strategy_type wagon_type = wn_decision_tree;
56 static int wagon_main(
int argc,
char **argv);
59 int main(
int argc,
char **argv)
62 wagon_main(argc,argv);
68 static int set_Vertex_Feats(
EST_Track &wgn_VertexFeats,
75 wgn_VertexFeats.
a(0,i) = 0.0;
86 const EST_String ws = (
const char *)token.whitespace();
90 wgn_VertexFeats.
a(0,i) = 1.0;
92 }
else if ((ws ==
",") || (ws ==
""))
94 s = atoi(token.string());
95 wgn_VertexFeats.
a(0,s) = 1.0;
101 e = atoi(token.string());
102 for (i=s; i<=e && i<wgn_VertexFeats.
num_channels(); i++)
103 wgn_VertexFeats.
a(0,i) = 1.0;
106 printf(
"wagon: track_feats invalid: %s at position %d\n",
107 (
const char *)wagon_track_features,
116 static int wagon_main(
int argc,
char **argv)
122 ostream *wgn_coutput = 0;
123 float stepwise_limit = 0;
124 int feats_start=0, feats_end=0;
130 "Summary: CART building program\n"+
131 "-desc <ifile> Field description file\n"+
132 "-data <ifile> Datafile, one vector per line\n"+
133 "-stop <int> {50} Minimum number of examples for leaf nodes\n"+
134 "-test <ifile> Datafile to test tree on\n"+
135 "-frs <float> {10} Float range split, number of partitions to\n"+
136 " split a float feature range into\n"+
137 "-dlist Build a decision list (rather than tree)\n"+
138 "-dtree Build a decision tree (rather than list) default\n"+
139 "-output <ofile> \n"+
140 "-o <ofile> File to save output tree in\n"+
141 "-distmatrix <ifile>\n"+
142 " A distance matrix for clustering\n"+
144 " track for vertex indices\n"+
145 "-track_start <int>\n"+
146 " start channel vertex indices\n"+
147 "-track_end <int>\n"+
148 " end (inclusive) channel for vertex indices\n"+
149 "-track_feats <string>\n"+
150 " Track features to use, comma separated list\n"+
151 " with feature numbers and/or ranges, 0 start\n"+
152 "-unittrack <ifile>\n"+
153 " track for unit start and length in vertex track\n"+
154 "-quiet No questions printed during building\n"+
155 "-verbose Lost of information printing during build\n"+
156 "-predictee <string>\n"+
157 " name of field to predict (default is first field)\n"+
158 "-ignore <string>\n"+
159 " Filename or bracket list of fields to ignore\n"+
160 "-count_field <string>\n"+
161 " Name of field containing count weight for samples\n"+
162 "-stepwise Incrementally find best features\n"+
163 "-swlimit <float> {0.0}\n"+
164 " Percentage necessary improvement for stepwise,\n"+
165 " may be negative.\n"+
166 "-swopt <string> Parameter to optimize for stepwise, for \n"+
167 " classification options are correct or entropy\n"+
168 " for regression options are rmse or correlation\n"+
169 " correct and correlation are the defaults\n"+
170 "-balance <float> For derived stop size, if dataset at node, divided\n"+
171 " by balance is greater than stop it is used as stop\n"+
172 " if balance is 0 (default) always use stop as is.\n"+
173 "-vertex_output <string> Output <mean> or <best> of cluster\n"+
174 "-held_out <int> Percent to hold out for pruning\n"+
175 "-heap <int> {210000}\n"+
176 " Set size of Lisp heap, should not normally need\n"+
177 " to be changed from its default, only with *very*\n"+
178 " large description files (> 1M)\n"+
179 "-noprune No (same class) pruning required\n",
183 wgn_held_out = al.
ival(
"-held_out");
185 wgn_balance = al.
fval(
"-balance");
188 cerr << argv[0] <<
": missing description and/or datafile" << endl;
189 cerr <<
"use -h for description of arguments" << endl;
198 wgn_min_cluster_size = atoi(al.
val(
"-stop"));
202 wgn_predictee_name = al.
val(
"-predictee");
203 if (al.
present(
"-count_field"))
204 wgn_count_field_name = al.
val(
"-count_field");
206 stepwise_limit = al.
fval(
"-swlimit");
208 wgn_float_range_split = atof(al.
val(
"-frs"));
210 wgn_opt_param = al.
val(
"-swopt");
211 if (al.
present(
"-vertex_output"))
212 wgn_vertex_output = al.
val(
"-vertex_output");
216 wgn_oname = al.
val(
"-o");
218 wgn_oname = al.
val(
"-output");
219 wgn_coutput =
new ofstream(wgn_oname);
222 cerr <<
"Wagon: can't open file \"" << wgn_oname <<
223 "\" for output " << endl;
231 if (wgn_DistMatrix.
load(al.
val(
"-distmatrix")) != 0)
233 cerr <<
"Wagon: failed to load Distance Matrix from \"" <<
234 al.
val(
"-distmatrix") <<
"\"\n" << endl;
239 wagon_type = wn_decision_list;
245 siod_init(al.
ival(
"-heap"));
251 ignores = read_from_string(ig);
253 ignores = vload(ig,1);
256 wgn_load_datadescription(al.
val(
"-desc"),ignores);
257 wgn_load_dataset(wgn_dataset,al.
val(
"-data"));
258 if (al.
present(
"-distmatrix") &&
259 (wgn_DistMatrix.
num_rows() < wgn_dataset.length()))
261 cerr <<
"wagon: distance matrix is smaller than number of training elements\n";
266 wgn_VertexTrack.
load(al.
val(
"-track"));
269 wgn_VertexFeats.
a(0,i) = 1.0;
272 if (al.
present(
"-track_start"))
274 feats_start = al.
ival(
"-track_start");
275 if ((feats_start < 0) ||
278 printf(
"wagon: track_start invalid: %d out of %d channels\n",
283 for (i=0; i<feats_start; i++)
284 wgn_VertexFeats.
a(0,i) = 0.0;
290 feats_end = al.
ival(
"-track_end");
291 if ((feats_end < feats_start) ||
294 printf(
"wagon: track_end invalid: %d between start %d out of %d channels\n",
300 for (i=feats_end+1; i<wgn_VertexTrack.
num_channels(); i++)
301 wgn_VertexFeats.
a(0,i) = 0.0;
303 if (al.
present(
"-track_feats"))
306 set_Vertex_Feats(wgn_VertexFeats,wagon_track_features);
318 wgn_UnitTrack.
load(al.
val(
"-unittrack"));
322 wgn_load_dataset(wgn_test_dataset,al.
val(
"-test"));
326 tree = wagon_stepwise(stepwise_limit);
327 else if (wagon_type == wn_decision_tree)
328 tree = wgn_build_tree(score);
329 else if (wagon_type == wn_decision_list)
331 tree = wgn_build_dlist(score,wgn_coutput);
334 cerr <<
"Wagon: unknown operation, not tree or list" << endl;
340 *wgn_coutput << *tree;
341 summary_results(*tree,wgn_coutput);
344 if (wgn_coutput != &cout)
void set_WhiteSpaceChars(const EST_String &ws)
set which characters are to be treated as whitespace
EST_TokenStream & get(EST_Token &t)
get next token in stream
float & a(int i, int c=0)
int ival(const EST_String &rkey, int m=1) const
int num_channels() const
return number of channels in track
void set_SingleCharSymbols(const EST_String &sc)
set which characters are to be treated as single character symbols
float fval(const EST_String &rkey, int m=1) const
void set_PrePunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (pre) punctuation
int open_string(const EST_String &newbuffer)
open an EST_TokenStream on a string rather than a file
void set_PunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (post) punctuation
EST_read_status load(const EST_String name, float ishift=0.0, float startt=0.0)
void resize(int num_frames, int num_channels, bool preserve=1)
EST_read_status load(const EST_String &filename)
Load from file (ascii or binary as defined in file)
const int present(const K &rkey) const
Returns true if key is present.
const V & val(const K &rkey, bool m=0) const
return value according to key (const)
int filepos(void) const
current file position in EST_TokenStream
int num_rows() const
return number of rows