Drizzled Public API Documentation

sel_arg.h

00001 /* -*- mode: c++; c-basic-offset: 2; indent-tabs-mode: nil; -*-
00002  *  vim:expandtab:shiftwidth=2:tabstop=2:smarttab:
00003  *
00004  *  Copyright (C) 2008-2009 Sun Microsystems, Inc.
00005  *
00006  *  This program is free software; you can redistribute it and/or modify
00007  *  it under the terms of the GNU General Public License as published by
00008  *  the Free Software Foundation; version 2 of the License.
00009  *
00010  *  This program is distributed in the hope that it will be useful,
00011  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  *  GNU General Public License for more details.
00014  *
00015  *  You should have received a copy of the GNU General Public License
00016  *  along with this program; if not, write to the Free Software
00017  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
00018  */
00019 
00020 #pragma once
00021 
00022 namespace drizzled {
00023 namespace optimizer {
00024 
00025 class RangeParameter;
00026 
00027 /*
00028   A construction block of the SEL_ARG-graph.
00029 
00030   The following description only covers graphs of SEL_ARG objects with
00031   sel_arg->type==KEY_RANGE:
00032 
00033   One SEL_ARG object represents an "elementary interval" in form
00034 
00035       min_value <=?  table.keypartX  <=? max_value
00036 
00037   The interval is a non-empty interval of any kind: with[out] minimum/maximum
00038   bound, [half]open/closed, single-point interval, etc.
00039 
00040   1. SEL_ARG GRAPH STRUCTURE
00041 
00042   SEL_ARG objects are linked together in a graph. The meaning of the graph
00043   is better demostrated by an example:
00044 
00045      tree->keys[i]
00046       |
00047       |             $              $
00048       |    part=1   $     part=2   $    part=3
00049       |             $              $
00050       |  +-------+  $   +-------+  $   +--------+
00051       |  | kp1<1 |--$-->| kp2=5 |--$-->| kp3=10 |
00052       |  +-------+  $   +-------+  $   +--------+
00053       |      |      $              $       |
00054       |      |      $              $   +--------+
00055       |      |      $              $   | kp3=12 |
00056       |      |      $              $   +--------+
00057       |  +-------+  $              $
00058       \->| kp1=2 |--$--------------$-+
00059          +-------+  $              $ |   +--------+
00060              |      $              $  ==>| kp3=11 |
00061          +-------+  $              $ |   +--------+
00062          | kp1=3 |--$--------------$-+       |
00063          +-------+  $              $     +--------+
00064              |      $              $     | kp3=14 |
00065             ...     $              $     +--------+
00066 
00067   The entire graph is partitioned into "interval lists".
00068 
00069   An interval list is a sequence of ordered disjoint intervals over the same
00070   key part. SEL_ARG are linked via "next" and "prev" pointers. Additionally,
00071   all intervals in the list form an RB-tree, linked via left/right/parent
00072   pointers. The RB-tree root SEL_ARG object will be further called "root of the
00073   interval list".
00074 
00075     In the example pic, there are 4 interval lists:
00076     "kp<1 OR kp1=2 OR kp1=3", "kp2=5", "kp3=10 OR kp3=12", "kp3=11 OR kp3=13".
00077     The vertical lines represent SEL_ARG::next/prev pointers.
00078 
00079   In an interval list, each member X may have SEL_ARG::next_key_part pointer
00080   pointing to the root of another interval list Y. The pointed interval list
00081   must cover a key part with greater number (i.e. Y->part > X->part).
00082 
00083     In the example pic, the next_key_part pointers are represented by
00084     horisontal lines.
00085 
00086   2. SEL_ARG GRAPH SEMANTICS
00087 
00088   It represents a condition in a special form (we don't have a name for it ATM)
00089   The SEL_ARG::next/prev is "OR", and next_key_part is "AND".
00090 
00091   For example, the picture represents the condition in form:
00092    (kp1 < 1 AND kp2=5 AND (kp3=10 OR kp3=12)) OR
00093    (kp1=2 AND (kp3=11 OR kp3=14)) OR
00094    (kp1=3 AND (kp3=11 OR kp3=14))
00095 
00096 
00097   3. SEL_ARG GRAPH USE
00098 
00099   Use get_mm_tree() to construct SEL_ARG graph from WHERE condition.
00100   Then walk the SEL_ARG graph and get a list of dijsoint ordered key
00101   intervals (i.e. intervals in form
00102 
00103    (constA1, .., const1_K) < (keypart1,.., keypartK) < (constB1, .., constB_K)
00104 
00105   Those intervals can be used to access the index. The uses are in:
00106    - check_quick_select() - Walk the SEL_ARG graph and find an estimate of
00107                             how many table records are contained within all
00108                             intervals.
00109    - get_quick_select()   - Walk the SEL_ARG, materialize the key intervals,
00110                             and create QuickRangeSelect object that will
00111                             read records within these intervals.
00112 
00113   4. SPACE COMPLEXITY NOTES
00114 
00115     SEL_ARG graph is a representation of an ordered disjoint sequence of
00116     intervals over the ordered set of index tuple values.
00117 
00118     For multi-part keys, one can construct a WHERE expression such that its
00119     list of intervals will be of combinatorial size. Here is an example:
00120 
00121       (keypart1 IN (1,2, ..., n1)) AND
00122       (keypart2 IN (1,2, ..., n2)) AND
00123       (keypart3 IN (1,2, ..., n3))
00124 
00125     For this WHERE clause the list of intervals will have n1*n2*n3 intervals
00126     of form
00127 
00128       (keypart1, keypart2, keypart3) = (k1, k2, k3), where 1 <= k{i} <= n{i}
00129 
00130     SEL_ARG graph structure aims to reduce the amount of required space by
00131     "sharing" the elementary intervals when possible (the pic at the
00132     beginning of this comment has examples of such sharing). The sharing may
00133     prevent combinatorial blowup:
00134 
00135       There are WHERE clauses that have combinatorial-size interval lists but
00136       will be represented by a compact SEL_ARG graph.
00137       Example:
00138         (keypartN IN (1,2, ..., n1)) AND
00139         ...
00140         (keypart2 IN (1,2, ..., n2)) AND
00141         (keypart1 IN (1,2, ..., n3))
00142 
00143     but not in all cases:
00144 
00145     - There are WHERE clauses that do have a compact SEL_ARG-graph
00146       representation but get_mm_tree() and its callees will construct a
00147       graph of combinatorial size.
00148       Example:
00149         (keypart1 IN (1,2, ..., n1)) AND
00150         (keypart2 IN (1,2, ..., n2)) AND
00151         ...
00152         (keypartN IN (1,2, ..., n3))
00153 
00154     - There are WHERE clauses for which the minimal possible SEL_ARG graph
00155       representation will have combinatorial size.
00156       Example:
00157         By induction: Let's take any interval on some keypart in the middle:
00158 
00159            kp15=c0
00160 
00161         Then let's AND it with this interval 'structure' from preceding and
00162         following keyparts:
00163 
00164           (kp14=c1 AND kp16=c3) OR keypart14=c2) (*)
00165 
00166         We will obtain this SEL_ARG graph:
00167 
00168              kp14     $      kp15      $      kp16
00169                       $                $
00170          +---------+  $   +---------+  $   +---------+
00171          | kp14=c1 |--$-->| kp15=c0 |--$-->| kp16=c3 |
00172          +---------+  $   +---------+  $   +---------+
00173               |       $                $
00174          +---------+  $   +---------+  $
00175          | kp14=c2 |--$-->| kp15=c0 |  $
00176          +---------+  $   +---------+  $
00177                       $                $
00178 
00179        Note that we had to duplicate "kp15=c0" and there was no way to avoid
00180        that.
00181        The induction step: AND the obtained expression with another "wrapping"
00182        expression like (*).
00183        When the process ends because of the limit on max. number of keyparts
00184        we'll have:
00185 
00186          WHERE clause length  is O(3*#max_keyparts)
00187          SEL_ARG graph size   is O(2^(#max_keyparts/2))
00188 
00189        (it is also possible to construct a case where instead of 2 in 2^n we
00190         have a bigger constant, e.g. 4, and get a graph with 4^(31/2)= 2^31
00191         nodes)
00192 
00193     We avoid consuming too much memory by setting a limit on the number of
00194     SEL_ARG object we can construct during one range analysis invocation.
00195 */
00196 
00197 class SEL_ARG :public memory::SqlAlloc
00198 {
00199 public:
00200   uint8_t min_flag,max_flag,maybe_flag;
00201   uint8_t part;         // Which key part
00202   uint8_t maybe_null;
00203   /*
00204     Number of children of this element in the RB-tree, plus 1 for this
00205     element itself.
00206   */
00207   uint16_t elements;
00208   /*
00209     Valid only for elements which are RB-tree roots: Number of times this
00210     RB-tree is referred to (it is referred by SEL_ARG::next_key_part or by
00211     SEL_TREE::keys[i] or by a temporary SEL_ARG* variable)
00212   */
00213   ulong use_count;
00214 
00215   Field *field;
00216   unsigned char *min_value,*max_value;      // Pointer to range
00217 
00218   /*
00219     eq_tree() requires that left == right == 0 if the type is MAYBE_KEY.
00220    */
00221   SEL_ARG *left,*right;   /* R-B tree children */
00222   SEL_ARG *next,*prev;    /* Links for bi-directional interval list */
00223   SEL_ARG *parent;        /* R-B tree parent */
00224   SEL_ARG *next_key_part;
00225   enum leaf_color { BLACK,RED } color;
00226   enum Type { IMPOSSIBLE, MAYBE, MAYBE_KEY, KEY_RANGE } type;
00227 
00228   enum 
00229   { 
00230     MAX_SEL_ARGS = 16000 
00231   };
00232 
00233   SEL_ARG() {}
00234 
00235   SEL_ARG(SEL_ARG &);
00236 
00237   SEL_ARG(Field *,const unsigned char *, const unsigned char *);
00238 
00239   SEL_ARG(Field *field, 
00240           uint8_t part, 
00241           unsigned char *min_value, 
00242           unsigned char *max_value,
00243           uint8_t min_flag, 
00244           uint8_t max_flag, 
00245           uint8_t maybe_flag);
00246 
00247   SEL_ARG(enum Type type_arg)
00248     :
00249       min_flag(0),
00250       elements(1),
00251       use_count(1),
00252       left(0),
00253       right(0),
00254       next_key_part(0),
00255       color(BLACK), 
00256       type(type_arg)
00257   {}
00258 
00259   int size() const
00260   {
00261     return elements;
00262   }
00263 
00264   inline bool is_same(SEL_ARG *arg)
00265   {
00266     if (type != arg->type || part != arg->part)
00267       return 0;
00268     if (type != KEY_RANGE)
00269       return 1;
00270     return (cmp_min_to_min(arg) == 0 && cmp_max_to_max(arg) == 0);
00271   }
00272 
00273   inline void merge_flags(SEL_ARG *arg) 
00274   { 
00275     maybe_flag|= arg->maybe_flag; 
00276   }
00277 
00278   inline void maybe_smaller() 
00279   { 
00280     maybe_flag= 1; 
00281   }
00282 
00283   /* Return true iff it's a single-point null interval */
00284   inline bool is_null_interval() 
00285   { 
00286     return (maybe_null && max_value[0] == 1);
00287   }
00288 
00289   inline int cmp_min_to_min(SEL_ARG *arg)
00290   {
00291     return sel_cmp(field,min_value, arg->min_value, min_flag, arg->min_flag);
00292   }
00293 
00294   inline int cmp_min_to_max(SEL_ARG *arg)
00295   {
00296     return sel_cmp(field,min_value, arg->max_value, min_flag, arg->max_flag);
00297   }
00298 
00299   inline int cmp_max_to_max(SEL_ARG *arg)
00300   {
00301     return sel_cmp(field,max_value, arg->max_value, max_flag, arg->max_flag);
00302   }
00303 
00304   inline int cmp_max_to_min(SEL_ARG *arg)
00305   {
00306     return sel_cmp(field,max_value, arg->min_value, max_flag, arg->min_flag);
00307   }
00308 
00309   SEL_ARG *clone_and(SEL_ARG *arg);
00310 
00311   SEL_ARG *clone_first(SEL_ARG *arg);
00312 
00313   SEL_ARG *clone_last(SEL_ARG *arg);
00314 
00315   SEL_ARG *clone(RangeParameter *param, SEL_ARG *new_parent, SEL_ARG **next);
00316 
00317   bool copy_min(SEL_ARG *arg);
00318 
00319   bool copy_max(SEL_ARG *arg);
00320 
00321   void copy_min_to_min(SEL_ARG *arg);
00322 
00323   void copy_min_to_max(SEL_ARG *arg);
00324 
00325   void copy_max_to_min(SEL_ARG *arg);
00326 
00327   /* returns a number of keypart values (0 or 1) appended to the key buffer */
00328   int store_min(uint32_t length, unsigned char **min_key, uint32_t min_key_flag);
00329 
00330   /* returns a number of keypart values (0 or 1) appended to the key buffer */
00331   int store_max(uint32_t length, unsigned char **max_key, uint32_t max_key_flag);
00332 
00333   /* returns a number of keypart values appended to the key buffer */
00334   int store_min_key(KEY_PART *key, unsigned char **range_key, uint32_t *range_key_flag);
00335 
00336   /* returns a number of keypart values appended to the key buffer */
00337   int store_max_key(KEY_PART *key, unsigned char **range_key, uint32_t *range_key_flag);
00338 
00339   SEL_ARG *insert(SEL_ARG *key);
00340   SEL_ARG *tree_delete(SEL_ARG *key);
00341   SEL_ARG *find_range(SEL_ARG *key);
00342   SEL_ARG *rb_insert(SEL_ARG *leaf);
00343 
00344   friend SEL_ARG *rb_delete_fixup(SEL_ARG *root,SEL_ARG *key, SEL_ARG *par);
00345 
00346   SEL_ARG *first();
00347 
00348   SEL_ARG *last();
00349 
00350   void make_root();
00351 
00352   inline bool simple_key()
00353   {
00354     return (! next_key_part && elements == 1);
00355   }
00356 
00357   void increment_use_count(long count)
00358   {
00359     if (next_key_part)
00360     {
00361       next_key_part->use_count+= count;
00362       count*= (next_key_part->use_count - count);
00363       for (SEL_ARG *pos= next_key_part->first(); pos; pos= pos->next)
00364         if (pos->next_key_part)
00365           pos->increment_use_count(count);
00366     }
00367   }
00368 
00369   void free_tree()
00370   {
00371     for (SEL_ARG *pos= first(); pos; pos= pos->next)
00372       if (pos->next_key_part)
00373       {
00374         pos->next_key_part->use_count--;
00375         pos->next_key_part->free_tree();
00376       }
00377   }
00378 
00379   inline SEL_ARG **parent_ptr()
00380   {
00381     return parent->left == this ? &parent->left : &parent->right;
00382   }
00383 
00384 
00385   /*
00386     Check if this SEL_ARG object represents a single-point interval
00387 
00388     SYNOPSIS
00389       is_singlepoint()
00390 
00391     DESCRIPTION
00392       Check if this SEL_ARG object (not tree) represents a single-point
00393       interval, i.e. if it represents a "keypart = const" or
00394       "keypart IS NULL".
00395 
00396     RETURN
00397       true   This SEL_ARG object represents a singlepoint interval
00398       false  Otherwise
00399   */
00400 
00401   bool is_singlepoint()
00402   {
00403     /*
00404       Check for NEAR_MIN ("strictly less") and NO_MIN_RANGE (-inf < field)
00405       flags, and the same for right edge.
00406     */
00407     if (min_flag || max_flag)
00408       return false;
00409     unsigned char *min_val= min_value;
00410     unsigned char *max_val= max_value;
00411 
00412     if (maybe_null)
00413     {
00414       /* First byte is a NULL value indicator */
00415       if (*min_val != *max_val)
00416         return false;
00417 
00418       if (*min_val)
00419         return true; /* This "x IS NULL" */
00420       min_val++;
00421       max_val++;
00422     }
00423     return ! field->key_cmp(min_val, max_val);
00424   }
00425 
00426   SEL_ARG *clone_tree(RangeParameter *param);
00427 
00428 private:
00429 
00430   /*
00431      Check if a compare is ok, when one takes ranges in account
00432      Returns -2 or 2 if the ranges where 'joined' like  < 2 and >= 2
00433    */
00434   int sel_cmp(Field *in_field, 
00435               unsigned char *a, 
00436               unsigned char *b, 
00437               uint8_t a_flag,
00438               uint8_t b_flag)
00439   {
00440     int cmp= 0;
00441     /* First check if there was a compare to a min or max element */
00442     if (a_flag & (NO_MIN_RANGE | NO_MAX_RANGE))
00443     {
00444       if ((a_flag & (NO_MIN_RANGE | NO_MAX_RANGE)) ==
00445           (b_flag & (NO_MIN_RANGE | NO_MAX_RANGE)))
00446         return 0;
00447       return (a_flag & NO_MIN_RANGE) ? -1 : 1;
00448     }
00449     if (b_flag & (NO_MIN_RANGE | NO_MAX_RANGE))
00450       return (b_flag & NO_MIN_RANGE) ? 1 : -1;
00451 
00452     if (in_field->real_maybe_null())      // If null is part of key
00453     {
00454       if (*a != *b)
00455       {
00456         return *a ? -1 : 1;
00457       }
00458       if (*a)
00459         goto end;         // NULL where equal
00460       a++; b++;         // Skip NULL marker
00461     }
00462     cmp= in_field->key_cmp(a , b);
00463     if (cmp) return cmp < 0 ? -1 : 1;   // The values differed
00464 
00465     // Check if the compared equal arguments was defined with open/closed range
00466 end:
00467     if (a_flag & (NEAR_MIN | NEAR_MAX))
00468     {
00469       if ((a_flag & (NEAR_MIN | NEAR_MAX)) == (b_flag & (NEAR_MIN | NEAR_MAX)))
00470         return 0;
00471       if (! (b_flag & (NEAR_MIN | NEAR_MAX)))
00472         return (a_flag & NEAR_MIN) ? 2 : -2;
00473       return (a_flag & NEAR_MIN) ? 1 : -1;
00474     }
00475     if (b_flag & (NEAR_MIN | NEAR_MAX))
00476       return (b_flag & NEAR_MIN) ? -2 : 2;
00477     return 0;         // The elements where equal
00478   }
00479 
00480   
00481 };
00482 
00483 SEL_ARG *rb_delete_fixup(SEL_ARG *root,
00484                          SEL_ARG *key,
00485                          SEL_ARG *par);
00486 
00487 extern SEL_ARG null_element;
00488 
00489 } /* namespace optimizer */
00490 
00491 } /* namespace drizzled */
00492