Drizzled Public API Documentation

unique.cc

00001 /* Copyright (C) 2001 MySQL AB
00002 
00003    This program is free software; you can redistribute it and/or modify
00004    it under the terms of the GNU General Public License as published by
00005    the Free Software Foundation; version 2 of the License.
00006 
00007    This program is distributed in the hope that it will be useful,
00008    but WITHOUT ANY WARRANTY; without even the implied warranty of
00009    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00010    GNU General Public License for more details.
00011 
00012    You should have received a copy of the GNU General Public License
00013    along with this program; if not, write to the Free Software
00014    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */
00015 
00016 /*
00017   Function to handle quick removal of duplicates
00018   This code is used when doing multi-table deletes to find the rows in
00019   reference tables that needs to be deleted.
00020 
00021   The basic idea is as follows:
00022 
00023   Store first all strings in a binary tree, ignoring duplicates.
00024 
00025   The unique entries will be returned in sort order, to ensure that we do the
00026   deletes in disk order.
00027 */
00028 
00029 #include <config.h>
00030 
00031 #include <math.h>
00032 
00033 #include <queue>
00034 
00035 #include <drizzled/sql_sort.h>
00036 #include <drizzled/session.h>
00037 #include <drizzled/sql_list.h>
00038 #include <drizzled/internal/iocache.h>
00039 #include <drizzled/unique.h>
00040 
00041 #if defined(CMATH_NAMESPACE)
00042 using namespace CMATH_NAMESPACE;
00043 #endif
00044 
00045 using namespace std;
00046 
00047 namespace drizzled
00048 {
00049 
00050 int unique_write_to_ptrs(unsigned char* key,
00051                          uint32_t, Unique *unique)
00052 {
00053   memcpy(unique->record_pointers, key, unique->size);
00054   unique->record_pointers+=unique->size;
00055   return 0;
00056 }
00057 
00058 Unique::Unique(qsort_cmp2 comp_func, void * comp_func_fixed_arg,
00059          uint32_t size_arg, size_t max_in_memory_size_arg)
00060   : max_in_memory_size(max_in_memory_size_arg),
00061     size(size_arg),
00062     elements(0)
00063 {
00064   // Second element is max size for memory use in unique sort
00065   init_tree(&tree, 0, 0, size, comp_func, false,
00066             NULL, comp_func_fixed_arg);
00067 }
00068 
00069 
00070 /*
00071   Calculate log2(n!)
00072 
00073   NOTES
00074     Stirling's approximate formula is used:
00075 
00076       n! ~= sqrt(2*M_PI*n) * (n/M_E)^n
00077 
00078     Derivation of formula used for calculations is as follows:
00079 
00080     log2(n!) = log(n!)/log(2) = log(sqrt(2*M_PI*n)*(n/M_E)^n) / log(2) =
00081 
00082       = (log(2*M_PI*n)/2 + n*log(n/M_E)) / log(2).
00083 */
00084 
00085 inline double log2_n_fact(double x)
00086 {
00087   return (log(2*M_PI*x)/2 + x*log(x/M_E)) / M_LN2;
00088 }
00089 
00090 
00091 /*
00092   Calculate cost of using Unique for processing nkeys elements of size
00093   key_size using max_in_memory_size memory.
00094 
00095   SYNOPSIS
00096     Unique::get_use_cost()
00097       buffer    space for temporary data, use Unique::get_cost_calc_buff_size
00098                 to get # bytes needed.
00099       nkeys     #of elements in Unique
00100       key_size  size of each elements in bytes
00101       max_in_memory_size amount of memory Unique will be allowed to use
00102 
00103   RETURN
00104     Cost in disk seeks.
00105 
00106   NOTES
00107     cost(using_unqiue) =
00108       cost(create_trees) +  (see #1)
00109       cost(merge) +         (see #2)
00110       cost(read_result)     (see #3)
00111 
00112     1. Cost of trees creation
00113       For each Unique::put operation there will be 2*log2(n+1) elements
00114       comparisons, where n runs from 1 tree_size (we assume that all added
00115       elements are different). Together this gives:
00116 
00117       n_compares = 2*(log2(2) + log2(3) + ... + log2(N+1)) = 2*log2((N+1)!)
00118 
00119       then cost(tree_creation) = n_compares*ROWID_COMPARE_COST;
00120 
00121       Total cost of creating trees:
00122       (n_trees - 1)*max_size_tree_cost + non_max_size_tree_cost.
00123 
00124       Approximate value of log2(N!) is calculated by log2_n_fact function.
00125 
00126        
00127       (The Next two are historical, we do all unique operations in memory or fail)
00128 
00129     2. Cost of merging.
00130       If only one tree is created by Unique no merging will be necessary.
00131       Otherwise, we model execution of merge_many_buff function and count
00132       #of merges. (The reason behind this is that number of buffers is small,
00133       while size of buffers is big and we don't want to loose precision with
00134       O(x)-style formula)
00135 
00136     3. If only one tree is created by Unique no disk io will happen.
00137       Otherwise, ceil(key_len*n_keys) disk seeks are necessary. We assume
00138       these will be random seeks.
00139 */
00140 
00141 double Unique::get_use_cost(uint32_t *, uint32_t nkeys, uint32_t key_size,
00142                             size_t max_in_memory_size_arg)
00143 {
00144   ulong max_elements_in_tree;
00145   ulong last_tree_elems;
00146   double result;
00147 
00148   max_elements_in_tree= ((ulong) max_in_memory_size_arg /
00149                          ALIGN_SIZE(sizeof(TREE_ELEMENT)+key_size));
00150 
00151   last_tree_elems= nkeys % max_elements_in_tree;
00152 
00153   /* Calculate cost of creating trees */
00154   result= 2*log2_n_fact(last_tree_elems + 1.0);
00155   result /= TIME_FOR_COMPARE_ROWID;
00156 
00157   return result;
00158 }
00159 
00160 Unique::~Unique()
00161 {
00162   delete_tree(&tree);
00163 }
00164 
00165 
00166 /*
00167   Clear the tree.
00168   You must call reset() if you want to reuse Unique after walk().
00169 */
00170 
00171 void
00172 Unique::reset()
00173 {
00174   reset_tree(&tree);
00175   assert(elements == 0);
00176 }
00177 
00178 
00179 /*
00180   DESCRIPTION
00181     Walks consecutively through all unique elements:
00182     if all elements are in memory, then it simply invokes 'tree_walk', else
00183     all flushed trees are loaded to memory piece-by-piece, pieces are
00184     sorted, and action is called for each unique value.
00185     Note: so as merging resets file_ptrs state, this method can change
00186     internal Unique state to undefined: if you want to reuse Unique after
00187     walk() you must call reset() first!
00188   SYNOPSIS
00189     Unique:walk()
00190   All params are 'IN':
00191     action  function-visitor, typed in include/tree.h
00192             function is called for each unique element
00193     arg     argument for visitor, which is passed to it on each call
00194   RETURN VALUE
00195     0    OK
00196     <> 0 error
00197  */
00198 
00199 bool Unique::walk(tree_walk_action action, void *walk_action_arg)
00200 {
00201   return tree_walk(&tree, action, walk_action_arg, left_root_right);
00202 }
00203 
00204 /*
00205   Modify the Table element so that when one calls init_records()
00206   the rows will be read in priority order.
00207 */
00208 
00209 bool Unique::get(Table *table)
00210 {
00211   table->sort.found_records= elements+tree.elements_in_tree;
00212 
00213   if ((record_pointers=table->sort.record_pointers= (unsigned char*)
00214        malloc(size * tree.elements_in_tree)))
00215   {
00216     (void) tree_walk(&tree, (tree_walk_action) unique_write_to_ptrs,
00217                      this, left_root_right);
00218     return 0;
00219   }
00220   /* Not enough memory */
00221   return 1;
00222 }
00223 
00224 } /* namespace drizzled */