Drizzled Public API Documentation

btr0cur.cc

00001 /*****************************************************************************
00002 
00003 Copyright (C) 1994, 2010, Innobase Oy. All Rights Reserved.
00004 Copyright (C) 2008, Google Inc.
00005 
00006 Portions of this file contain modifications contributed and copyrighted by
00007 Google, Inc. Those modifications are gratefully acknowledged and are described
00008 briefly in the InnoDB documentation. The contributions by Google are
00009 incorporated with their permission, and subject to the conditions contained in
00010 the file COPYING.Google.
00011 
00012 This program is free software; you can redistribute it and/or modify it under
00013 the terms of the GNU General Public License as published by the Free Software
00014 Foundation; version 2 of the License.
00015 
00016 This program is distributed in the hope that it will be useful, but WITHOUT
00017 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00018 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
00019 
00020 You should have received a copy of the GNU General Public License along with
00021 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
00022 St, Fifth Floor, Boston, MA 02110-1301 USA
00023 
00024 *****************************************************************************/
00025 
00026 /**************************************************/
00044 #include "btr0cur.h"
00045 
00046 #ifdef UNIV_NONINL
00047 #include "btr0cur.ic"
00048 #endif
00049 
00050 #include "row0upd.h"
00051 #ifndef UNIV_HOTBACKUP
00052 #include "mtr0log.h"
00053 #include "page0page.h"
00054 #include "page0zip.h"
00055 #include "rem0rec.h"
00056 #include "rem0cmp.h"
00057 #include "buf0lru.h"
00058 #include "btr0btr.h"
00059 #include "btr0sea.h"
00060 #include "row0purge.h"
00061 #include "row0upd.h"
00062 #include "trx0rec.h"
00063 #include "trx0roll.h" /* trx_is_recv() */
00064 #include "que0que.h"
00065 #include "row0row.h"
00066 #include "srv0srv.h"
00067 #include "ibuf0ibuf.h"
00068 #include "lock0lock.h"
00069 #include "zlib.h"
00070 
/** The operation requested of a B-tree cursor search, used to decide
whether the operation can be buffered in the insert/delete buffer when
the leaf page is not in the buffer pool (see btr_cur_search_to_nth_level). */
typedef enum btr_op_enum {
  BTR_NO_OP = 0,      /*!< not buffered */
  BTR_INSERT_OP,      /*!< insert, do not ignore UNIQUE */
  BTR_INSERT_IGNORE_UNIQUE_OP,  /*!< insert, ignoring UNIQUE */
  BTR_DELETE_OP,      /*!< purge a delete-marked record */
  BTR_DELMARK_OP      /*!< mark a record for deletion */
} btr_op_t;
00079 
#ifdef UNIV_DEBUG
/** If the following is set to TRUE, this module prints a lot of
trace information of individual record operations. */
UNIV_INTERN ibool btr_cur_print_record_ops = FALSE;
#endif /* UNIV_DEBUG */

/** Number of searches down the B-tree in btr_cur_search_to_nth_level()
that could not use the adaptive hash index. */
UNIV_INTERN ulint btr_cur_n_non_sea = 0;
/** Number of successful adaptive hash index lookups in
btr_cur_search_to_nth_level(). */
UNIV_INTERN ulint btr_cur_n_sea   = 0;
/** Previously sampled value of btr_cur_n_non_sea; NOTE(review): presumably
snapshotted by the server monitor to compute per-interval rates — confirm
against srv0srv. */
UNIV_INTERN ulint btr_cur_n_non_sea_old = 0;
/** Previously sampled value of btr_cur_n_sea; see btr_cur_n_non_sea_old. */
UNIV_INTERN ulint btr_cur_n_sea_old = 0;

/** In the optimistic insert, if the insert does not fit, but this much space
can be released by page reorganization, then the page is reorganized. */
#define BTR_CUR_PAGE_REORGANIZE_LIMIT (UNIV_PAGE_SIZE / 32)

/** @brief The byte offsets within a BLOB part header. */
/* @{ */
/*--------------------------------------*/
#define BTR_BLOB_HDR_PART_LEN   0 /*!< BLOB part length on this page */
#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page number,
            FIL_NULL if none */
/*--------------------------------------*/
#define BTR_BLOB_HDR_SIZE   8 /*!< size of a BLOB part header, in bytes */
/* @} */
#endif /* !UNIV_HOTBACKUP */

/** A BLOB field reference full of zero, for use in assertions and checks. */
const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE]= {0};
00121 
00122 #ifndef UNIV_HOTBACKUP
00123 /*******************************************************************/
/*******************************************************************//**
Marks all externally stored fields in a record as owned by the record;
NOTE(review): presumably called when a delete mark is removed — confirm
at the definition site. */
static
void
btr_cur_unmark_extern_fields(
/*=========================*/
  page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
        part will be updated, or NULL */
  rec_t*    rec,  /*!< in/out: record in a clustered index */
  dict_index_t* index,  /*!< in: index of the page */
  const ulint*  offsets,/*!< in: array returned by rec_get_offsets() */
  mtr_t*    mtr); /*!< in: mtr, or NULL if not logged */
/*******************************************************************/
/** Adds path information to the cursor for the current page, for which
the binary search has been performed (used for range estimates). */
static
void
btr_cur_add_path_info(
/*==================*/
  btr_cur_t*  cursor,   /*!< in: cursor positioned on a page */
  ulint   height,   /*!< in: height of the page in the tree;
          0 means leaf node */
  ulint   root_height); /*!< in: height of the root node in the tree */
/***********************************************************/
/** Frees the externally stored fields of a record that are mentioned
in the given update vector. */
static
void
btr_rec_free_updated_extern_fields(
/*===============================*/
  dict_index_t* index,  /*!< in: index of rec; the index tree MUST be
        X-latched */
  rec_t*    rec,  /*!< in: record */
  page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
        part will be updated, or NULL */
  const ulint*  offsets,/*!< in: rec_get_offsets(rec, index) */
  const upd_t*  update, /*!< in: update vector */
  enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
  mtr_t*    mtr); /*!< in: mini-transaction handle which contains
        an X-latch to record page and to the index tree */
/***********************************************************/
/** Frees all externally stored fields of a record. */
static
void
btr_rec_free_externally_stored_fields(
/*==================================*/
  dict_index_t* index,  /*!< in: index of the data, the index tree MUST
        be X-latched */
  rec_t*    rec,  /*!< in: record */
  const ulint*  offsets,/*!< in: rec_get_offsets(rec, index) */
  page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
        part will be updated, or NULL */
  enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
  mtr_t*    mtr); /*!< in: mini-transaction handle which contains
        an X-latch to record page and to the index tree */
/***********************************************************/
/** Gets the externally stored size of a record.
@return externally stored part, in units of a database page */
static
ulint
btr_rec_get_externally_stored_len(
/*==============================*/
  rec_t*    rec,  /*!< in: record */
  const ulint*  offsets);/*!< in: array returned by rec_get_offsets() */
#endif /* !UNIV_HOTBACKUP */
00191 
00192 /******************************************************/
00194 UNIV_INLINE
00195 void
00196 btr_rec_set_deleted_flag(
00197 /*=====================*/
00198   rec_t*    rec,  
00199   page_zip_des_t* page_zip,
00200   ulint   flag) 
00201 {
00202   if (page_rec_is_comp(rec)) {
00203     rec_set_deleted_flag_new(rec, page_zip, flag);
00204   } else {
00205     ut_ad(!page_zip);
00206     rec_set_deleted_flag_old(rec, flag);
00207   }
00208 }
00209 
00210 #ifndef UNIV_HOTBACKUP
00211 /*==================== B-TREE SEARCH =========================*/
00212 
00213 /********************************************************************/
/********************************************************************//**
Latches the leaf page or pages requested, according to latch_mode.
For BTR_MODIFY_TREE the left and right siblings are also x-latched,
strictly from left to right to obey the latching order. */
static
void
btr_cur_latch_leaves(
/*=================*/
  page_t*   page,   /*!< in: leaf page where the search
          converged */
  ulint   space,    /*!< in: space id */
  ulint   zip_size, /*!< in: compressed page size in bytes
          or 0 for uncompressed pages */
  ulint   page_no,  /*!< in: page number of the leaf */
  ulint   latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
  btr_cur_t*  cursor,   /*!< in: cursor */
  mtr_t*    mtr)    /*!< in: mtr */
{
  ulint   mode;
  ulint   left_page_no;
  ulint   right_page_no;
  buf_block_t*  get_block;

  ut_ad(page && mtr);

  switch (latch_mode) {
  case BTR_SEARCH_LEAF:
  case BTR_MODIFY_LEAF:
    /* Only the leaf page itself is latched: s-latch for search,
    x-latch for modification. */
    mode = latch_mode == BTR_SEARCH_LEAF ? RW_S_LATCH : RW_X_LATCH;
    get_block = btr_block_get(space, zip_size, page_no, mode, mtr);
#ifdef UNIV_BTR_DEBUG
    ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
    get_block->check_index_page_at_flush = TRUE;
    return;
  case BTR_MODIFY_TREE:
    /* x-latch also brothers from left to right */
    left_page_no = btr_page_get_prev(page, mtr);

    if (left_page_no != FIL_NULL) {
      get_block = btr_block_get(space, zip_size,
              left_page_no,
              RW_X_LATCH, mtr);
#ifdef UNIV_BTR_DEBUG
      /* The left sibling must link back to our page. */
      ut_a(page_is_comp(get_block->frame)
           == page_is_comp(page));
      ut_a(btr_page_get_next(get_block->frame, mtr)
           == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
      get_block->check_index_page_at_flush = TRUE;
    }

    get_block = btr_block_get(space, zip_size, page_no,
            RW_X_LATCH, mtr);
#ifdef UNIV_BTR_DEBUG
    ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
    get_block->check_index_page_at_flush = TRUE;

    right_page_no = btr_page_get_next(page, mtr);

    if (right_page_no != FIL_NULL) {
      get_block = btr_block_get(space, zip_size,
              right_page_no,
              RW_X_LATCH, mtr);
#ifdef UNIV_BTR_DEBUG
      /* The right sibling must link back to our page. */
      ut_a(page_is_comp(get_block->frame)
           == page_is_comp(page));
      ut_a(btr_page_get_prev(get_block->frame, mtr)
           == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
      get_block->check_index_page_at_flush = TRUE;
    }

    return;

  case BTR_SEARCH_PREV:
  case BTR_MODIFY_PREV:
    mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
    /* latch also left brother */
    left_page_no = btr_page_get_prev(page, mtr);

    if (left_page_no != FIL_NULL) {
      get_block = btr_block_get(space, zip_size,
              left_page_no, mode, mtr);
      /* Remember the left neighbor for the caller
      (used by BTR_SEARCH_PREV/BTR_MODIFY_PREV). */
      cursor->left_block = get_block;
#ifdef UNIV_BTR_DEBUG
      ut_a(page_is_comp(get_block->frame)
           == page_is_comp(page));
      ut_a(btr_page_get_next(get_block->frame, mtr)
           == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
      get_block->check_index_page_at_flush = TRUE;
    }

    get_block = btr_block_get(space, zip_size, page_no, mode, mtr);
#ifdef UNIV_BTR_DEBUG
    ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
    get_block->check_index_page_at_flush = TRUE;
    return;
  }

  /* Unknown latch mode. */
  ut_error;
}
00316 
00317 /********************************************************************/
/********************************************************************//**
Searches an index tree and positions a tree cursor on a given level.
NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
to node pointer page number fields on the upper levels of the tree!
If mode is PAGE_CUR_LE (used in inserts), the cursor is left at the
position after which the search tuple should be inserted; both
cursor->up_match and cursor->low_match are then set. If mode is
PAGE_CUR_GE, up_match is set. */
UNIV_INTERN
void
btr_cur_search_to_nth_level(
/*========================*/
  dict_index_t* index,  /*!< in: index */
  ulint   level,  /*!< in: the tree level of search */
  const dtuple_t* tuple,  /*!< in: data tuple; NOTE: n_fields_cmp in
        tuple must be set so that it cannot get
        compared to the node ptr page number field! */
  ulint   mode, /*!< in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G,
        or PAGE_CUR_GE; inserts should always be made
        using PAGE_CUR_LE */
  ulint   latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
        at most one of BTR_INSERT, BTR_DELETE_MARK,
        BTR_DELETE, and possibly BTR_ESTIMATE or
        BTR_IGNORE_SEC_UNIQUE */
  btr_cur_t*  cursor, /*!< in/out: tree cursor; the cursor page is
        s- or x-latched */
  ulint   has_search_latch,/*!< in: info on the latch mode the
        caller currently has on btr_search_latch:
        RW_S_LATCH, or 0 */
  const char* file, /*!< in: file name of caller */
  ulint   line, /*!< in: line number of caller */
  mtr_t*    mtr)  /*!< in: mtr */
{
  page_t*   page;
  buf_block_t*  block;
  ulint   space;
  buf_block_t*  guess;
  ulint   height;
  ulint   page_no;
  ulint   up_match;
  ulint   up_bytes;
  ulint   low_match;
  ulint   low_bytes;
  ulint   savepoint;
  ulint   rw_latch;
  ulint   page_mode;
  ulint   buf_mode;
  ulint   estimate;
  ulint   zip_size;
  page_cur_t* page_cursor;
  btr_op_t  btr_op;
  ulint   root_height = 0; /* remove warning */

#ifdef BTR_CUR_ADAPT
  btr_search_t* info;
#endif
  mem_heap_t* heap    = NULL;
  ulint   offsets_[REC_OFFS_NORMAL_SIZE];
  ulint*    offsets   = offsets_;
  rec_offs_init(offsets_);
  /* Currently, PAGE_CUR_LE is the only search mode used for searches
  ending to upper levels */

  ut_ad(level == 0 || mode == PAGE_CUR_LE);
  ut_ad(dict_index_check_search_tuple(index, tuple));
  ut_ad(!dict_index_is_ibuf(index) || ibuf_inside());
  ut_ad(dtuple_check_typed(tuple));

#ifdef UNIV_DEBUG
  /* Poison the match counters so debug assertions can detect
  paths that fail to set them. */
  cursor->up_match = ULINT_UNDEFINED;
  cursor->low_match = ULINT_UNDEFINED;
#endif

  /* These flags are mutually exclusive, they are lumped together
  with the latch mode for historical reasons. It's possible for
  none of the flags to be set. */
  switch (UNIV_EXPECT(latch_mode
          & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
          0)) {
  case 0:
    btr_op = BTR_NO_OP;
    break;
  case BTR_INSERT:
    btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
      ? BTR_INSERT_IGNORE_UNIQUE_OP
      : BTR_INSERT_OP;
    break;
  case BTR_DELETE:
    btr_op = BTR_DELETE_OP;
    ut_a(cursor->purge_node);
    break;
  case BTR_DELETE_MARK:
    btr_op = BTR_DELMARK_OP;
    break;
  default:
    /* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
    should be specified at a time */
    ut_error;
  }

  /* Operations on the insert buffer tree cannot be buffered. */
  ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
  /* Operations on the clustered index cannot be buffered. */
  ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));

  estimate = latch_mode & BTR_ESTIMATE;

  /* Turn the flags unrelated to the latch mode off. */
  latch_mode &= ~(BTR_INSERT
      | BTR_DELETE_MARK
      | BTR_DELETE
      | BTR_ESTIMATE
      | BTR_IGNORE_SEC_UNIQUE);

  cursor->flag = BTR_CUR_BINARY;
  cursor->index = index;

  cursor->ibuf_cnt = ULINT_UNDEFINED;

#ifndef BTR_CUR_ADAPT
  guess = NULL;
#else
  info = btr_search_get_info(index);

  guess = info->root_guess;

#ifdef BTR_CUR_HASH_ADAPT

#ifdef UNIV_SEARCH_PERF_STAT
  info->n_searches++;
#endif
  /* Fast path: try the adaptive hash index before descending
  the tree. */
  if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_NOT_LOCKED
      && latch_mode <= BTR_MODIFY_LEAF
      && info->last_hash_succ
      && !estimate
#ifdef PAGE_CUR_LE_OR_EXTENDS
      && mode != PAGE_CUR_LE_OR_EXTENDS
#endif /* PAGE_CUR_LE_OR_EXTENDS */
      /* If !has_search_latch, we do a dirty read of
      btr_search_enabled below, and btr_search_guess_on_hash()
      will have to check it again. */
      && UNIV_LIKELY(btr_search_enabled)
      && btr_search_guess_on_hash(index, info, tuple, mode,
          latch_mode, cursor,
          has_search_latch, mtr)) {

    /* Search using the hash index succeeded */

    ut_ad(cursor->up_match != ULINT_UNDEFINED
          || mode != PAGE_CUR_GE);
    ut_ad(cursor->up_match != ULINT_UNDEFINED
          || mode != PAGE_CUR_LE);
    ut_ad(cursor->low_match != ULINT_UNDEFINED
          || mode != PAGE_CUR_LE);
    btr_cur_n_sea++;

    return;
  }
#endif /* BTR_CUR_HASH_ADAPT */
#endif /* BTR_CUR_ADAPT */
  btr_cur_n_non_sea++;

  /* If the hash search did not succeed, do binary search down the
  tree */

  if (has_search_latch) {
    /* Release possible search latch to obey latching order */
    rw_lock_s_unlock(&btr_search_latch);
  }

  /* Store the position of the tree latch we push to mtr so that we
  know how to release it when we have latched leaf node(s) */

  savepoint = mtr_set_savepoint(mtr);

  if (latch_mode == BTR_MODIFY_TREE) {
    mtr_x_lock(dict_index_get_lock(index), mtr);

  } else if (latch_mode == BTR_CONT_MODIFY_TREE) {
    /* Do nothing */
    ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
          MTR_MEMO_X_LOCK));
  } else {
    mtr_s_lock(dict_index_get_lock(index), mtr);
  }

  page_cursor = btr_cur_get_page_cur(cursor);

  space = dict_index_get_space(index);
  page_no = dict_index_get_page(index);

  up_match = 0;
  up_bytes = 0;
  low_match = 0;
  low_bytes = 0;

  height = ULINT_UNDEFINED;

  /* We use these modified search modes on non-leaf levels of the
  B-tree. These let us end up in the right B-tree leaf. In that leaf
  we use the original search mode. */

  switch (mode) {
  case PAGE_CUR_GE:
    page_mode = PAGE_CUR_L;
    break;
  case PAGE_CUR_G:
    page_mode = PAGE_CUR_LE;
    break;
  default:
#ifdef PAGE_CUR_LE_OR_EXTENDS
    ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
          || mode == PAGE_CUR_LE_OR_EXTENDS);
#else /* PAGE_CUR_LE_OR_EXTENDS */
    ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
#endif /* PAGE_CUR_LE_OR_EXTENDS */
    page_mode = mode;
    break;
  }

  /* Loop and search until we arrive at the desired level */

search_loop:
  buf_mode = BUF_GET;
  rw_latch = RW_NO_LATCH;

  if (height != 0) {
    /* We are about to fetch the root or a non-leaf page. */
  } else if (latch_mode <= BTR_MODIFY_LEAF) {
    rw_latch = latch_mode;

    if (btr_op != BTR_NO_OP
        && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {

      /* Try to buffer the operation if the leaf
      page is not in the buffer pool. */

      buf_mode = btr_op == BTR_DELETE_OP
        ? BUF_GET_IF_IN_POOL_OR_WATCH
        : BUF_GET_IF_IN_POOL;
    }
  }

  zip_size = dict_table_zip_size(index->table);

retry_page_get:
  block = buf_page_get_gen(
    space, zip_size, page_no, rw_latch, guess, buf_mode,
    file, line, mtr);

  if (block == NULL) {
    /* This must be a search to perform an insert/delete
    mark/ delete; try using the insert/delete buffer */

    ut_ad(height == 0);
    ut_ad(cursor->thr);

    switch (btr_op) {
    case BTR_INSERT_OP:
    case BTR_INSERT_IGNORE_UNIQUE_OP:
      ut_ad(buf_mode == BUF_GET_IF_IN_POOL);

      if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
          space, zip_size, page_no,
          cursor->thr)) {

        cursor->flag = BTR_CUR_INSERT_TO_IBUF;

        goto func_exit;
      }
      break;

    case BTR_DELMARK_OP:
      ut_ad(buf_mode == BUF_GET_IF_IN_POOL);

      if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
          index, space, zip_size,
          page_no, cursor->thr)) {

        cursor->flag = BTR_CUR_DEL_MARK_IBUF;

        goto func_exit;
      }

      break;

    case BTR_DELETE_OP:
      ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);

      if (!row_purge_poss_sec(cursor->purge_node,
            index, tuple)) {

        /* The record cannot be purged yet. */
        cursor->flag = BTR_CUR_DELETE_REF;
      } else if (ibuf_insert(IBUF_OP_DELETE, tuple,
                 index, space, zip_size,
                 page_no,
                 cursor->thr)) {

        /* The purge was buffered. */
        cursor->flag = BTR_CUR_DELETE_IBUF;
      } else {
        /* The purge could not be buffered. */
        buf_pool_watch_unset(space, page_no);
        break;
      }

      /* Release the buffer-pool watch that
      BUF_GET_IF_IN_POOL_OR_WATCH set on the page. */
      buf_pool_watch_unset(space, page_no);
      goto func_exit;

    default:
      ut_error;
    }

    /* Insert to the insert/delete buffer did not succeed, we
    must read the page from disk. */

    buf_mode = BUF_GET;

    goto retry_page_get;
  }

  block->check_index_page_at_flush = TRUE;
  page = buf_block_get_frame(block);

  if (rw_latch != RW_NO_LATCH) {
#ifdef UNIV_ZIP_DEBUG
    const page_zip_des_t* page_zip
      = buf_block_get_page_zip(block);
    ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

    buf_block_dbg_add_level(block, SYNC_TREE_NODE);
  }

  ut_ad(index->id == btr_page_get_index_id(page));

  if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
    /* We are in the root node */

    height = btr_page_get_level(page, mtr);
    root_height = height;
    cursor->tree_height = root_height + 1;

#ifdef BTR_CUR_ADAPT
    if (block != guess) {
      /* Cache the root page for the next search. */
      info->root_guess = block;
    }
#endif
  }

  if (height == 0) {
    if (rw_latch == RW_NO_LATCH) {

      btr_cur_latch_leaves(
        page, space, zip_size, page_no, latch_mode,
        cursor, mtr);
    }

    if (latch_mode != BTR_MODIFY_TREE
        && latch_mode != BTR_CONT_MODIFY_TREE) {

      /* Release the tree s-latch */

      mtr_release_s_latch_at_savepoint(
        mtr, savepoint, dict_index_get_lock(index));
    }

    /* On the leaf level, revert to the caller's search mode. */
    page_mode = mode;
  }

  page_cur_search_with_match(
    block, index, tuple, page_mode, &up_match, &up_bytes,
    &low_match, &low_bytes, page_cursor);

  if (estimate) {
    btr_cur_add_path_info(cursor, height, root_height);
  }

  /* If this is the desired level, leave the loop */

  ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor),
             mtr));

  if (level != height) {

    const rec_t*  node_ptr;
    ut_ad(height > 0);

    height--;
    guess = NULL;

    node_ptr = page_cur_get_rec(page_cursor);

    offsets = rec_get_offsets(
      node_ptr, index, offsets, ULINT_UNDEFINED, &heap);

    /* Go to the child node */
    page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);

    if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
      /* We're doing a search on an ibuf tree and we're one
      level above the leaf page. */

      ulint is_min_rec;

      ut_ad(level == 0);

      is_min_rec = rec_get_info_bits(node_ptr, 0)
        & REC_INFO_MIN_REC_FLAG;

      if (!is_min_rec) {
        /* Remember the ibuf merge counter stored in
        the node pointer record. */
        cursor->ibuf_cnt
          = ibuf_rec_get_counter(node_ptr);

        ut_a(cursor->ibuf_cnt <= 0xFFFF
             || cursor->ibuf_cnt == ULINT_UNDEFINED);
      }

      buf_mode = BUF_GET;
      rw_latch = RW_NO_LATCH;
      goto retry_page_get;
    }

    goto search_loop;
  }

  if (level != 0) {
    /* x-latch the page */
    page = btr_page_get(
      space, zip_size, page_no, RW_X_LATCH, mtr);

    ut_a((ibool)!!page_is_comp(page)
         == dict_table_is_comp(index->table));
  } else {
    cursor->low_match = low_match;
    cursor->low_bytes = low_bytes;
    cursor->up_match = up_match;
    cursor->up_bytes = up_bytes;

#ifdef BTR_CUR_ADAPT
    /* We do a dirty read of btr_search_enabled here.  We
    will properly check btr_search_enabled again in
    btr_search_build_page_hash_index() before building a
    page hash index, while holding btr_search_latch. */
    if (UNIV_LIKELY(btr_search_enabled)) {

      btr_search_info_update(index, cursor);
    }
#endif
    ut_ad(cursor->up_match != ULINT_UNDEFINED
          || mode != PAGE_CUR_GE);
    ut_ad(cursor->up_match != ULINT_UNDEFINED
          || mode != PAGE_CUR_LE);
    ut_ad(cursor->low_match != ULINT_UNDEFINED
          || mode != PAGE_CUR_LE);
  }

func_exit:

  if (UNIV_LIKELY_NULL(heap)) {
    mem_heap_free(heap);
  }

  if (has_search_latch) {

    /* Re-acquire the search latch released above, so the
    caller's latch state is unchanged. */
    rw_lock_s_lock(&btr_search_latch);
  }
}
00797 
00798 /*****************************************************************/
/*****************************************************************//**
Opens a cursor at either end of an index, positioning it before the
first record (from_left) or after the last record (!from_left) on
the leaf level. */
UNIV_INTERN
void
btr_cur_open_at_index_side_func(
/*============================*/
  ibool   from_left,  /*!< in: TRUE if open to the low end,
          FALSE if to the high end */
  dict_index_t* index,    /*!< in: index */
  ulint   latch_mode, /*!< in: latch mode, possibly ORed
          with BTR_ESTIMATE */
  btr_cur_t*  cursor,   /*!< in: cursor */
  const char* file,   /*!< in: file name of caller */
  ulint   line,   /*!< in: line number of caller */
  mtr_t*    mtr)    /*!< in: mtr */
{
  page_cur_t* page_cursor;
  ulint   page_no;
  ulint   space;
  ulint   zip_size;
  ulint   height;
  ulint   root_height = 0; /* remove warning */
  rec_t*    node_ptr;
  ulint   estimate;
  ulint   savepoint;
  mem_heap_t* heap    = NULL;
  ulint   offsets_[REC_OFFS_NORMAL_SIZE];
  ulint*    offsets   = offsets_;
  rec_offs_init(offsets_);

  estimate = latch_mode & BTR_ESTIMATE;
  latch_mode = latch_mode & ~BTR_ESTIMATE;

  /* Store the position of the tree latch we push to mtr so that we
  know how to release it when we have latched the leaf node */

  savepoint = mtr_set_savepoint(mtr);

  if (latch_mode == BTR_MODIFY_TREE) {
    mtr_x_lock(dict_index_get_lock(index), mtr);
  } else {
    mtr_s_lock(dict_index_get_lock(index), mtr);
  }

  page_cursor = btr_cur_get_page_cur(cursor);
  cursor->index = index;

  space = dict_index_get_space(index);
  zip_size = dict_table_zip_size(index->table);
  page_no = dict_index_get_page(index);

  height = ULINT_UNDEFINED;

  /* Descend from the root, always following the edge node
  pointer on the requested side. */
  for (;;) {
    buf_block_t*  block;
    page_t*   page;
    block = buf_page_get_gen(space, zip_size, page_no,
           RW_NO_LATCH, NULL, BUF_GET,
           file, line, mtr);
    page = buf_block_get_frame(block);
    ut_ad(index->id == btr_page_get_index_id(page));

    block->check_index_page_at_flush = TRUE;

    if (height == ULINT_UNDEFINED) {
      /* We are in the root node */

      height = btr_page_get_level(page, mtr);
      root_height = height;
    }

    if (height == 0) {
      btr_cur_latch_leaves(page, space, zip_size, page_no,
               latch_mode, cursor, mtr);

      /* In versions <= 3.23.52 we had forgotten to
      release the tree latch here. If in an index scan
      we had to scan far to find a record visible to the
      current transaction, that could starve others
      waiting for the tree latch. */

      if ((latch_mode != BTR_MODIFY_TREE)
          && (latch_mode != BTR_CONT_MODIFY_TREE)) {

        /* Release the tree s-latch */

        mtr_release_s_latch_at_savepoint(
          mtr, savepoint,
          dict_index_get_lock(index));
      }
    }

    if (from_left) {
      page_cur_set_before_first(block, page_cursor);
    } else {
      page_cur_set_after_last(block, page_cursor);
    }

    if (height == 0) {
      if (estimate) {
        btr_cur_add_path_info(cursor, height,
                  root_height);
      }

      break;
    }

    ut_ad(height > 0);

    /* On non-leaf levels, step onto the first (or last) node
    pointer record so we can descend through it. */
    if (from_left) {
      page_cur_move_to_next(page_cursor);
    } else {
      page_cur_move_to_prev(page_cursor);
    }

    if (estimate) {
      btr_cur_add_path_info(cursor, height, root_height);
    }

    height--;

    node_ptr = page_cur_get_rec(page_cursor);
    offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
            ULINT_UNDEFINED, &heap);
    /* Go to the child node */
    page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
  }

  if (UNIV_LIKELY_NULL(heap)) {
    mem_heap_free(heap);
  }
}
00929 
00930 /**********************************************************************/
/**********************************************************************//**
Positions a cursor at a randomly chosen position within a B-tree, by
descending from the root and picking a random user record on each level. */
UNIV_INTERN
void
btr_cur_open_at_rnd_pos_func(
/*=========================*/
  dict_index_t* index,    /*!< in: index */
  ulint   latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
  btr_cur_t*  cursor,   /*!< in/out: B-tree cursor */
  const char* file,   /*!< in: file name of caller */
  ulint   line,   /*!< in: line number of caller */
  mtr_t*    mtr)    /*!< in: mtr */
{
  page_cur_t* page_cursor;
  ulint   page_no;
  ulint   space;
  ulint   zip_size;
  ulint   height;
  rec_t*    node_ptr;
  mem_heap_t* heap    = NULL;
  ulint   offsets_[REC_OFFS_NORMAL_SIZE];
  ulint*    offsets   = offsets_;
  rec_offs_init(offsets_);

  if (latch_mode == BTR_MODIFY_TREE) {
    mtr_x_lock(dict_index_get_lock(index), mtr);
  } else {
    mtr_s_lock(dict_index_get_lock(index), mtr);
  }

  page_cursor = btr_cur_get_page_cur(cursor);
  cursor->index = index;

  space = dict_index_get_space(index);
  zip_size = dict_table_zip_size(index->table);
  page_no = dict_index_get_page(index);

  height = ULINT_UNDEFINED;

  for (;;) {
    buf_block_t*  block;
    page_t*   page;

    block = buf_page_get_gen(space, zip_size, page_no,
           RW_NO_LATCH, NULL, BUF_GET,
           file, line, mtr);
    page = buf_block_get_frame(block);
    ut_ad(index->id == btr_page_get_index_id(page));

    if (height == ULINT_UNDEFINED) {
      /* We are in the root node */

      height = btr_page_get_level(page, mtr);
    }

    if (height == 0) {
      btr_cur_latch_leaves(page, space, zip_size, page_no,
               latch_mode, cursor, mtr);
    }

    /* Pick a random user record on this page. */
    page_cur_open_on_rnd_user_rec(block, page_cursor);

    if (height == 0) {

      break;
    }

    ut_ad(height > 0);

    height--;

    node_ptr = page_cur_get_rec(page_cursor);
    offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
            ULINT_UNDEFINED, &heap);
    /* Go to the child node */
    page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
  }

  if (UNIV_LIKELY_NULL(heap)) {
    mem_heap_free(heap);
  }
}
01012 
01013 /*==================== B-TREE INSERT =========================*/
01014 
01015 /*************************************************************/
01021 static
01022 rec_t*
01023 btr_cur_insert_if_possible(
01024 /*=======================*/
01025   btr_cur_t*  cursor, 
01027   const dtuple_t* tuple,  
01029   ulint   n_ext,  
01030   mtr_t*    mtr)  
01031 {
01032   page_cur_t* page_cursor;
01033   buf_block_t*  block;
01034   rec_t*    rec;
01035 
01036   ut_ad(dtuple_check_typed(tuple));
01037 
01038   block = btr_cur_get_block(cursor);
01039 
01040   ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
01041   page_cursor = btr_cur_get_page_cur(cursor);
01042 
01043   /* Now, try the insert */
01044   rec = page_cur_tuple_insert(page_cursor, tuple,
01045             cursor->index, n_ext, mtr);
01046 
01047   if (UNIV_UNLIKELY(!rec)) {
01048     /* If record did not fit, reorganize */
01049 
01050     if (btr_page_reorganize(block, cursor->index, mtr)) {
01051 
01052       page_cur_search(block, cursor->index, tuple,
01053           PAGE_CUR_LE, page_cursor);
01054 
01055       rec = page_cur_tuple_insert(page_cursor, tuple,
01056                 cursor->index, n_ext, mtr);
01057     }
01058   }
01059 
01060   return(rec);
01061 }
01062 
01063 /*************************************************************/
01066 UNIV_INLINE
01067 ulint
01068 btr_cur_ins_lock_and_undo(
01069 /*======================*/
01070   ulint   flags,  
01073   btr_cur_t*  cursor, 
01074   dtuple_t* entry,  
01075   que_thr_t*  thr,  
01076   mtr_t*    mtr,  
01077   ibool*    inherit)
01080 {
01081   dict_index_t* index;
01082   ulint   err;
01083   rec_t*    rec;
01084   roll_ptr_t  roll_ptr;
01085 
01086   /* Check if we have to wait for a lock: enqueue an explicit lock
01087   request if yes */
01088 
01089   rec = btr_cur_get_rec(cursor);
01090   index = cursor->index;
01091 
01092   err = lock_rec_insert_check_and_lock(flags, rec,
01093                btr_cur_get_block(cursor),
01094                index, thr, mtr, inherit);
01095 
01096   if (err != DB_SUCCESS) {
01097 
01098     return(err);
01099   }
01100 
01101   if (dict_index_is_clust(index) && !dict_index_is_ibuf(index)) {
01102 
01103     err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
01104                 thr, index, entry,
01105                 NULL, 0, NULL,
01106                 &roll_ptr);
01107     if (err != DB_SUCCESS) {
01108 
01109       return(err);
01110     }
01111 
01112     /* Now we can fill in the roll ptr field in entry */
01113 
01114     if (!(flags & BTR_KEEP_SYS_FLAG)) {
01115 
01116       row_upd_index_entry_sys_field(entry, index,
01117                   DATA_ROLL_PTR, roll_ptr);
01118     }
01119   }
01120 
01121   return(DB_SUCCESS);
01122 }
01123 
01124 #ifdef UNIV_DEBUG
01125 /*************************************************************/
01127 static
01128 void
01129 btr_cur_trx_report(
01130 /*===============*/
01131   trx_t*      trx,  
01132   const dict_index_t* index,  
01133   const char*   op) 
01134 {
01135   fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ",
01136     (ullint) trx->id);
01137   fputs(op, stderr);
01138   dict_index_name_print(stderr, trx, index);
01139   putc('\n', stderr);
01140 }
01141 #endif /* UNIV_DEBUG */
01142 
/*************************************************************/
/** Tries to perform an insert to the page pointed to by the cursor,
without splitting the page. The caller must hold an x-latch on the page
(asserted below). On success, *rec points to the inserted record and
*big_rec may carry fields that the caller must store externally.
Returns DB_FAIL when a pessimistic (splitting) insert is needed.
@return	DB_SUCCESS, DB_FAIL, DB_TOO_BIG_RECORD, or a lock/undo error */
UNIV_INTERN
ulint
btr_cur_optimistic_insert(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags;
				BTR_NO_LOCKING_FLAG is tested below */
	btr_cur_t*	cursor,	/*!< in: cursor on the page after which to
				insert; cursor position stays valid */
	dtuple_t*	entry,	/*!< in/out: entry to insert; may be converted
				to a big_rec representation */
	rec_t**		rec,	/*!< out: inserted record on success */
	big_rec_t**	big_rec,/*!< out: big_rec vector to be stored
				externally by the caller, or NULL */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	que_thr_t*	thr,	/*!< in: query thread */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	big_rec_t*	big_rec_vec	= NULL;
	dict_index_t*	index;
	page_cur_t*	page_cursor;
	buf_block_t*	block;
	page_t*		page;
	ulint		max_size;
	rec_t*		dummy_rec;
	ibool		leaf;
	ibool		reorg;
	ibool		inherit;
	ulint		zip_size;
	ulint		rec_size;
	ulint		err;

	*big_rec = NULL;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	index = cursor->index;
	zip_size = buf_block_get_zip_size(block);
#ifdef UNIV_DEBUG_VALGRIND
	if (zip_size) {
		UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
		UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
	}
#endif /* UNIV_DEBUG_VALGRIND */

	/* Diagnostic only: a badly typed tuple is reported but the
	insert still proceeds. */
	if (!dtuple_check_typed_no_assert(entry)) {
		fputs("InnoDB: Error in a tuple to insert into ", stderr);
		dict_index_name_print(stderr, thr_get_trx(thr), index);
	}
#ifdef UNIV_DEBUG
	if (btr_cur_print_record_ops && thr) {
		btr_cur_trx_report(thr_get_trx(thr), index, "insert into ");
		dtuple_print(stderr, entry);
	}
#endif /* UNIV_DEBUG */

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	max_size = page_get_max_insert_size_after_reorganize(page, 1);
	leaf = page_is_leaf(page);

	/* Calculate the record size when entry is converted to a record */
	rec_size = rec_get_converted_size(index, entry, n_ext);

	if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
				   dtuple_get_n_fields(entry), zip_size)) {

		/* The record is so big that we have to store some fields
		externally on separate database pages */
		big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);

		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {

			return(DB_TOO_BIG_RECORD);
		}

		/* Recompute the size: the conversion above shrank entry
		by moving columns into big_rec_vec. */
		rec_size = rec_get_converted_size(index, entry, n_ext);
	}

	if (UNIV_UNLIKELY(zip_size)) {
		/* Estimate the free space of an empty compressed page.
		Subtract one byte for the encoded heap_no in the
		modification log. */
		ulint	free_space_zip = page_zip_empty_size(
			cursor->index->n_fields, zip_size) - 1;
		ulint	n_uniq = dict_index_get_n_unique_in_tree(index);

		ut_ad(dict_table_is_comp(index->table));

		/* There should be enough room for two node pointer
		records on an empty non-leaf page.  This prevents
		infinite page splits. */

		if (UNIV_LIKELY(entry->n_fields >= n_uniq)
		    && UNIV_UNLIKELY(REC_NODE_PTR_SIZE
				     + rec_get_converted_size_comp_prefix(
					     index, entry->fields, n_uniq,
					     NULL)
				     /* On a compressed page, there is
				     a two-byte entry in the dense
				     page directory for every record.
				     But there is no record header. */
				     - (REC_N_NEW_EXTRA_BYTES - 2)
				     > free_space_zip / 2)) {

			/* Undo the big_rec conversion before bailing
			out, so the caller sees entry unchanged. */
			if (big_rec_vec) {
				dtuple_convert_back_big_rec(
					index, entry, big_rec_vec);
			}

			return(DB_TOO_BIG_RECORD);
		}
	}

	/* If there have been many consecutive inserts, and we are on the leaf
	level, check if we have to split the page to reserve enough free space
	for future updates of records. */

	if (dict_index_is_clust(index)
	    && (page_get_n_recs(page) >= 2)
	    && UNIV_LIKELY(leaf)
	    && (dict_index_get_space_reserve() + rec_size > max_size)
	    && (btr_page_get_split_rec_to_right(cursor, &dummy_rec)
		|| btr_page_get_split_rec_to_left(cursor, &dummy_rec))) {
fail:
	err = DB_FAIL;
fail_err:

		/* Restore entry before returning a failure, as above. */
		if (big_rec_vec) {
			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
		}

		return(err);
	}

	/* Too little room even after a hypothetical reorganization:
	fall back to the pessimistic (splitting) code path. */
	if (UNIV_UNLIKELY(max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT
			  || max_size < rec_size)
	    && UNIV_LIKELY(page_get_n_recs(page) > 1)
	    && page_get_max_insert_size(page, 1) < rec_size) {

		goto fail;
	}

	/* Check locks and write to the undo log, if specified */
	err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
					thr, mtr, &inherit);

	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {

		goto fail_err;
	}

	page_cursor = btr_cur_get_page_cur(cursor);

	/* Now, try the insert */

	{
		/* If the page cursor rec changed across the insert, the
		compressed page was reorganized under us. */
		const rec_t*	page_cursor_rec = page_cur_get_rec(page_cursor);
		*rec = page_cur_tuple_insert(page_cursor, entry, index,
					     n_ext, mtr);
		reorg = page_cursor_rec != page_cur_get_rec(page_cursor);

		if (UNIV_UNLIKELY(reorg)) {
			/* Only compressed pages reorganize implicitly here. */
			ut_a(zip_size);
			ut_a(*rec);
		}
	}

	if (UNIV_UNLIKELY(!*rec) && UNIV_LIKELY(!reorg)) {
		/* If the record did not fit, reorganize */
		if (UNIV_UNLIKELY(!btr_page_reorganize(block, index, mtr))) {
			ut_a(zip_size);

			goto fail;
		}

		ut_ad(zip_size
		      || page_get_max_insert_size(page, 1) == max_size);

		reorg = TRUE;

		/* Reposition the cursor: reorganization moved records. */
		page_cur_search(block, index, entry, PAGE_CUR_LE, page_cursor);

		*rec = page_cur_tuple_insert(page_cursor, entry, index,
					     n_ext, mtr);

		if (UNIV_UNLIKELY(!*rec)) {
			if (UNIV_LIKELY(zip_size != 0)) {

				goto fail;
			}

			/* On an uncompressed page the insert must fit
			after reorganization (we checked max_size above);
			anything else is corruption. */
			fputs("InnoDB: Error: cannot insert tuple ", stderr);
			dtuple_print(stderr, entry);
			fputs(" into ", stderr);
			dict_index_name_print(stderr, thr_get_trx(thr), index);
			fprintf(stderr, "\nInnoDB: max insert size %lu\n",
				(ulong) max_size);
			ut_error;
		}
	}

#ifdef BTR_CUR_HASH_ADAPT
	if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
		btr_search_update_hash_node_on_insert(cursor);
	} else {
		btr_search_update_hash_on_insert(cursor);
	}
#endif

	if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {

		lock_update_insert(block, *rec);
	}

#if 0
	fprintf(stderr, "Insert into page %lu, max ins size %lu,"
		" rec %lu ind type %lu\n",
		buf_block_get_page_no(block), max_size,
		rec_size + PAGE_DIR_SLOT_SIZE, index->type);
#endif
	if (leaf && !dict_index_is_clust(index)) {
		/* Update the free bits of the B-tree page in the
		insert buffer bitmap. */

		/* The free bits in the insert buffer bitmap must
		never exceed the free space on a page.  It is safe to
		decrement or reset the bits in the bitmap in a
		mini-transaction that is committed before the
		mini-transaction that affects the free space. */

		/* It is unsafe to increment the bits in a separately
		committed mini-transaction, because in crash recovery,
		the free bits could momentarily be set too high. */

		if (zip_size) {
			/* Update the bits in the same mini-transaction. */
			ibuf_update_free_bits_zip(block, mtr);
		} else {
			/* Decrement the bits in a separate
			mini-transaction. */
			ibuf_update_free_bits_if_full(
				block, max_size,
				rec_size + PAGE_DIR_SLOT_SIZE);
		}
	}

	*big_rec = big_rec_vec;

	return(DB_SUCCESS);
}
01405 
01406 /*************************************************************/
01412 UNIV_INTERN
01413 ulint
01414 btr_cur_pessimistic_insert(
01415 /*=======================*/
01416   ulint   flags,  
01422   btr_cur_t*  cursor, 
01424   dtuple_t* entry,  
01425   rec_t**   rec,  
01427   big_rec_t** big_rec,
01430   ulint   n_ext,  
01431   que_thr_t*  thr,  
01432   mtr_t*    mtr)  
01433 {
01434   dict_index_t* index   = cursor->index;
01435   ulint   zip_size  = dict_table_zip_size(index->table);
01436   big_rec_t*  big_rec_vec = NULL;
01437   mem_heap_t* heap    = NULL;
01438   ulint   err;
01439   ibool   dummy_inh;
01440   ibool   success;
01441   ulint   n_extents = 0;
01442   ulint   n_reserved;
01443 
01444   ut_ad(dtuple_check_typed(entry));
01445 
01446   *big_rec = NULL;
01447 
01448   ut_ad(mtr_memo_contains(mtr,
01449         dict_index_get_lock(btr_cur_get_index(cursor)),
01450         MTR_MEMO_X_LOCK));
01451   ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
01452         MTR_MEMO_PAGE_X_FIX));
01453 
01454   /* Try first an optimistic insert; reset the cursor flag: we do not
01455   assume anything of how it was positioned */
01456 
01457   cursor->flag = BTR_CUR_BINARY;
01458 
01459   err = btr_cur_optimistic_insert(flags, cursor, entry, rec,
01460           big_rec, n_ext, thr, mtr);
01461   if (err != DB_FAIL) {
01462 
01463     return(err);
01464   }
01465 
01466   /* Retry with a pessimistic insert. Check locks and write to undo log,
01467   if specified */
01468 
01469   err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
01470           thr, mtr, &dummy_inh);
01471 
01472   if (err != DB_SUCCESS) {
01473 
01474     return(err);
01475   }
01476 
01477   if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
01478     /* First reserve enough free space for the file segments
01479     of the index tree, so that the insert will not fail because
01480     of lack of space */
01481 
01482     n_extents = cursor->tree_height / 16 + 3;
01483 
01484     success = fsp_reserve_free_extents(&n_reserved, index->space,
01485                n_extents, FSP_NORMAL, mtr);
01486     if (!success) {
01487       return(DB_OUT_OF_FILE_SPACE);
01488     }
01489   }
01490 
01491   if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
01492            dict_table_is_comp(index->table),
01493            dict_index_get_n_fields(index),
01494            zip_size)) {
01495     /* The record is so big that we have to store some fields
01496     externally on separate database pages */
01497 
01498     if (UNIV_LIKELY_NULL(big_rec_vec)) {
01499       /* This should never happen, but we handle
01500       the situation in a robust manner. */
01501       ut_ad(0);
01502       dtuple_convert_back_big_rec(index, entry, big_rec_vec);
01503     }
01504 
01505     big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);
01506 
01507     if (big_rec_vec == NULL) {
01508 
01509       if (n_extents > 0) {
01510         fil_space_release_free_extents(index->space,
01511                      n_reserved);
01512       }
01513       return(DB_TOO_BIG_RECORD);
01514     }
01515   }
01516 
01517   if (dict_index_get_page(index)
01518       == buf_block_get_page_no(btr_cur_get_block(cursor))) {
01519 
01520     /* The page is the root page */
01521     *rec = btr_root_raise_and_insert(cursor, entry, n_ext, mtr);
01522   } else {
01523     *rec = btr_page_split_and_insert(cursor, entry, n_ext, mtr);
01524   }
01525 
01526   if (UNIV_LIKELY_NULL(heap)) {
01527     mem_heap_free(heap);
01528   }
01529 
01530   ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec);
01531 
01532 #ifdef BTR_CUR_ADAPT
01533   btr_search_update_hash_on_insert(cursor);
01534 #endif
01535   if (!(flags & BTR_NO_LOCKING_FLAG)) {
01536 
01537     lock_update_insert(btr_cur_get_block(cursor), *rec);
01538   }
01539 
01540   if (n_extents > 0) {
01541     fil_space_release_free_extents(index->space, n_reserved);
01542   }
01543 
01544   *big_rec = big_rec_vec;
01545 
01546   return(DB_SUCCESS);
01547 }
01548 
01549 /*==================== B-TREE UPDATE =========================*/
01550 
01551 /*************************************************************/
01554 UNIV_INLINE
01555 ulint
01556 btr_cur_upd_lock_and_undo(
01557 /*======================*/
01558   ulint   flags,  
01559   btr_cur_t*  cursor, 
01560   const upd_t*  update, 
01561   ulint   cmpl_info,
01563   que_thr_t*  thr,  
01564   mtr_t*    mtr,  
01565   roll_ptr_t* roll_ptr)
01566 {
01567   dict_index_t* index;
01568   rec_t*    rec;
01569   ulint   err;
01570 
01571   ut_ad(cursor && update && thr && roll_ptr);
01572 
01573   rec = btr_cur_get_rec(cursor);
01574   index = cursor->index;
01575 
01576   if (!dict_index_is_clust(index)) {
01577     /* We do undo logging only when we update a clustered index
01578     record */
01579     return(lock_sec_rec_modify_check_and_lock(
01580              flags, btr_cur_get_block(cursor), rec,
01581              index, thr, mtr));
01582   }
01583 
01584   /* Check if we have to wait for a lock: enqueue an explicit lock
01585   request if yes */
01586 
01587   err = DB_SUCCESS;
01588 
01589   if (!(flags & BTR_NO_LOCKING_FLAG)) {
01590     mem_heap_t* heap    = NULL;
01591     ulint   offsets_[REC_OFFS_NORMAL_SIZE];
01592     rec_offs_init(offsets_);
01593 
01594     err = lock_clust_rec_modify_check_and_lock(
01595       flags, btr_cur_get_block(cursor), rec, index,
01596       rec_get_offsets(rec, index, offsets_,
01597           ULINT_UNDEFINED, &heap), thr);
01598     if (UNIV_LIKELY_NULL(heap)) {
01599       mem_heap_free(heap);
01600     }
01601     if (err != DB_SUCCESS) {
01602 
01603       return(err);
01604     }
01605   }
01606 
01607   /* Append the info about the update in the undo log */
01608 
01609   err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
01610               index, NULL, update,
01611               cmpl_info, rec, roll_ptr);
01612   return(err);
01613 }
01614 
/***********************************************************/
/** Writes a redo log record of updating a record in-place.
The byte layout written here must match what
btr_cur_parse_update_in_place() reads back at recovery:
flags (1 byte), sys col values, record page offset (2 bytes),
then the update vector. */
UNIV_INLINE
void
btr_cur_update_in_place_log(
/*========================*/
	ulint		flags,		/*!< in: undo logging and locking
					flags; must fit in one byte
					(asserted below) */
	rec_t*		rec,		/*!< in: record being updated */
	dict_index_t*	index,		/*!< in: index of the record */
	const upd_t*	update,		/*!< in: update vector */
	trx_t*		trx,		/*!< in: transaction */
	roll_ptr_t	roll_ptr,	/*!< in: roll pointer */
	mtr_t*		mtr)		/*!< in/out: mini-transaction */
{
	byte*	log_ptr;
	page_t*	page	= page_align(rec);
	ut_ad(flags < 256);
	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));

	log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
					    ? MLOG_COMP_REC_UPDATE_IN_PLACE
					    : MLOG_REC_UPDATE_IN_PLACE,
					    1 + DATA_ROLL_PTR_LEN + 14 + 2
					    + MLOG_BUF_MARGIN);

	if (!log_ptr) {
		/* Logging in mtr is switched off during crash recovery */
		return;
	}

	/* The code below assumes index is a clustered index: change index to
	the clustered index if we are updating a secondary index record (or we
	could as well skip writing the sys col values to the log in this case
	because they are not needed for a secondary index record update) */

	index = dict_table_get_first_index(index->table);

	mach_write_to_1(log_ptr, flags);
	log_ptr++;

	log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
						mtr);
	mach_write_to_2(log_ptr, page_offset(rec));
	log_ptr += 2;

	row_upd_index_write_log(update, log_ptr, mtr);
}
01662 #endif /* UNIV_HOTBACKUP */
01663 
/***********************************************************/
/** Parses a redo log record of updating a record in-place
(the counterpart of btr_cur_update_in_place_log()) and, when a page
is supplied, applies the update to the record on that page.
@return	end of log record or NULL if the record was truncated */
UNIV_INTERN
byte*
btr_cur_parse_update_in_place(
/*==========================*/
	byte*		ptr,	/*!< in: buffer containing the log record */
	byte*		end_ptr,/*!< in: buffer end; reads never go past it */
	page_t*		page,	/*!< in/out: page to apply to, or NULL if
				only parsing */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page descriptor,
				passed through to the in-place update */
	dict_index_t*	index)	/*!< in: index corresponding to the page */
{
	ulint		flags;
	rec_t*		rec;
	upd_t*		update;
	ulint		pos;
	trx_id_t	trx_id;
	roll_ptr_t	roll_ptr;
	ulint		rec_offset;
	mem_heap_t*	heap;
	ulint*		offsets;

	/* Each read below is bounds-checked against end_ptr; NULL means
	the log record is incomplete and the caller must wait for more. */
	if (end_ptr < ptr + 1) {

		return(NULL);
	}

	flags = mach_read_from_1(ptr);
	ptr++;

	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);

	if (ptr == NULL) {

		return(NULL);
	}

	if (end_ptr < ptr + 2) {

		return(NULL);
	}

	rec_offset = mach_read_from_2(ptr);
	ptr += 2;

	ut_a(rec_offset <= UNIV_PAGE_SIZE);

	heap = mem_heap_create(256);

	ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);

	/* Apply only when parsing succeeded and a page was given. */
	if (!ptr || !page) {

		goto func_exit;
	}

	ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
	rec = page + rec_offset;

	/* We do not need to reserve btr_search_latch, as the page is only
	being recovered, and there cannot be a hash index to it. */

	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
						   pos, trx_id, roll_ptr);
	}

	row_upd_rec_in_place(rec, index, offsets, update, page_zip);

func_exit:
	mem_heap_free(heap);

	return(ptr);
}
01741 
01742 #ifndef UNIV_HOTBACKUP
01743 /*************************************************************/
01747 UNIV_INTERN
01748 ibool
01749 btr_cur_update_alloc_zip(
01750 /*=====================*/
01751   page_zip_des_t* page_zip,
01752   buf_block_t*  block,  
01753   dict_index_t* index,  
01754   ulint   length, 
01755   ibool   create, 
01757   mtr_t*    mtr)  
01758 {
01759   ut_a(page_zip == buf_block_get_page_zip(block));
01760   ut_ad(page_zip);
01761   ut_ad(!dict_index_is_ibuf(index));
01762 
01763   if (page_zip_available(page_zip, dict_index_is_clust(index),
01764              length, create)) {
01765     return(TRUE);
01766   }
01767 
01768   if (!page_zip->m_nonempty) {
01769     /* The page has been freshly compressed, so
01770     recompressing it will not help. */
01771     return(FALSE);
01772   }
01773 
01774   if (!page_zip_compress(page_zip, buf_block_get_frame(block),
01775              index, mtr)) {
01776     /* Unable to compress the page */
01777     return(FALSE);
01778   }
01779 
01780   /* After recompressing a page, we must make sure that the free
01781   bits in the insert buffer bitmap will not exceed the free
01782   space on the page.  Because this function will not attempt
01783   recompression unless page_zip_available() fails above, it is
01784   safe to reset the free bits if page_zip_available() fails
01785   again, below.  The free bits can safely be reset in a separate
01786   mini-transaction.  If page_zip_available() succeeds below, we
01787   can be sure that the page_zip_compress() above did not reduce
01788   the free space available on the page. */
01789 
01790   if (!page_zip_available(page_zip, dict_index_is_clust(index),
01791         length, create)) {
01792     /* Out of space: reset the free bits. */
01793     if (!dict_index_is_clust(index)
01794         && page_is_leaf(buf_block_get_frame(block))) {
01795       ibuf_reset_free_bits(block);
01796     }
01797     return(FALSE);
01798   }
01799 
01800   return(TRUE);
01801 }
01802 
/*************************************************************/
/** Updates a record in-place when the update does not change any field
sizes. Does lock checking, undo logging, redo logging, and adaptive
hash index maintenance for the record the cursor points to.
@return	DB_SUCCESS, DB_ZIP_OVERFLOW, or a lock/undo error */
UNIV_INTERN
ulint
btr_cur_update_in_place(
/*====================*/
	ulint		flags,	/*!< in: undo logging and locking flags;
				BTR_KEEP_SYS_FLAG is tested below */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
				cursor stays valid */
	const upd_t*	update,	/*!< in: update vector (must not change
				field sizes — NOTE(review): enforced by the
				caller, not checked here) */
	ulint		cmpl_info,
				/*!< in: compiler info, passed to the
				undo log */
	que_thr_t*	thr,	/*!< in: query thread */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	dict_index_t*	index;
	buf_block_t*	block;
	page_zip_des_t*	page_zip;
	ulint		err;
	rec_t*		rec;
	roll_ptr_t	roll_ptr	= 0;
	trx_t*		trx;
	ulint		was_delete_marked;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	rec = btr_cur_get_rec(cursor);
	index = cursor->index;
	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	/* The insert buffer tree should never be updated in place. */
	ut_ad(!dict_index_is_ibuf(index));

	trx = thr_get_trx(thr);
	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
#ifdef UNIV_DEBUG
	if (btr_cur_print_record_ops && thr) {
		btr_cur_trx_report(trx, index, "update ");
		rec_print_new(stderr, rec, offsets);
	}
#endif /* UNIV_DEBUG */

	block = btr_cur_get_block(cursor);
	page_zip = buf_block_get_page_zip(block);

	/* Check that enough space is available on the compressed page. */
	if (UNIV_LIKELY_NULL(page_zip)
	    && !btr_cur_update_alloc_zip(page_zip, block, index,
					 rec_offs_size(offsets), FALSE, mtr)) {
		return(DB_ZIP_OVERFLOW);
	}

	/* Do lock checking and undo logging */
	err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
					thr, mtr, &roll_ptr);
	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {

		if (UNIV_LIKELY_NULL(heap)) {
			mem_heap_free(heap);
		}
		return(err);
	}

	if (block->is_hashed) {
		/* The function row_upd_changes_ord_field_binary works only
		if the update vector was built for a clustered index, we must
		NOT call it if index is secondary */

		if (!dict_index_is_clust(index)
		    || row_upd_changes_ord_field_binary(NULL, index, update)) {

			/* Remove possible hash index pointer to this record */
			btr_search_update_hash_on_delete(cursor);
		}

		/* Hold the AHI latch exclusively across the in-place
		modification; released after row_upd_rec_in_place below. */
		rw_lock_x_lock(&btr_search_latch);
	}

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		row_upd_rec_sys_fields(rec, NULL,
				       index, offsets, trx, roll_ptr);
	}

	/* Remember the delete-mark state so we can detect an
	un-delete-mark transition after the update is applied. */
	was_delete_marked = rec_get_deleted_flag(
		rec, page_is_comp(buf_block_get_frame(block)));

	row_upd_rec_in_place(rec, index, offsets, update, page_zip);

	if (block->is_hashed) {
		rw_lock_x_unlock(&btr_search_latch);
	}

	if (page_zip && !dict_index_is_clust(index)
	    && page_is_leaf(buf_block_get_frame(block))) {
		/* Update the free bits in the insert buffer. */
		ibuf_update_free_bits_zip(block, mtr);
	}

	btr_cur_update_in_place_log(flags, rec, index, update,
				    trx, roll_ptr, mtr);

	if (was_delete_marked
	    && !rec_get_deleted_flag(rec, page_is_comp(
					     buf_block_get_frame(block)))) {
		/* The new updated record owns its possible externally
		stored fields */

		btr_cur_unmark_extern_fields(page_zip,
					     rec, index, offsets, mtr);
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(DB_SUCCESS);
}
01924 
/*************************************************************/
/** Tries to update a record on a page in an index tree without splitting:
either delegates to btr_cur_update_in_place() when no field size changes,
or deletes the record and re-inserts the updated version on the same page.
Falls back with DB_OVERFLOW/DB_UNDERFLOW/DB_ZIP_OVERFLOW when the caller
must use the pessimistic path.
@return	DB_SUCCESS, DB_OVERFLOW, DB_UNDERFLOW, DB_ZIP_OVERFLOW,
or a lock/undo error */
UNIV_INTERN
ulint
btr_cur_optimistic_update(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags;
				BTR_KEEP_SYS_FLAG is tested below */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
				cursor stays valid and positioned on the
				same record */
	const upd_t*	update,	/*!< in: update vector; must not contain
				externally stored fields (checked below) */
	ulint		cmpl_info,
				/*!< in: compiler info, passed to the
				undo log */
	que_thr_t*	thr,	/*!< in: query thread */
	mtr_t*		mtr)	/*!< in/out: mini-transaction; must hold an
				x-latch on the page (asserted below) */
{
	dict_index_t*	index;
	page_cur_t*	page_cursor;
	ulint		err;
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;
	rec_t*		rec;
	ulint		max_size;
	ulint		new_rec_size;
	ulint		old_rec_size;
	dtuple_t*	new_entry;
	roll_ptr_t	roll_ptr;
	trx_t*		trx;
	mem_heap_t*	heap;
	ulint		i;
	ulint		n_ext;
	ulint*		offsets;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	rec = btr_cur_get_rec(cursor);
	index = cursor->index;
	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	/* The insert buffer tree should never be updated in place. */
	ut_ad(!dict_index_is_ibuf(index));

	heap = mem_heap_create(1024);
	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);

#ifdef UNIV_DEBUG
	if (btr_cur_print_record_ops && thr) {
		btr_cur_trx_report(thr_get_trx(thr), index, "update ");
		rec_print_new(stderr, rec, offsets);
	}
#endif /* UNIV_DEBUG */

	if (!row_upd_changes_field_size_or_external(index, offsets, update)) {

		/* The simplest and the most common case: the update does not
		change the size of any field and none of the updated fields is
		externally stored in rec or update, and there is enough space
		on the compressed page to log the update. */

		mem_heap_free(heap);
		return(btr_cur_update_in_place(flags, cursor, update,
					       cmpl_info, thr, mtr));
	}

	if (rec_offs_any_extern(offsets)) {
any_extern:
		/* Externally stored fields are treated in pessimistic
		update */

		mem_heap_free(heap);
		return(DB_OVERFLOW);
	}

	/* Also bail out if the update vector itself introduces an
	externally stored field. */
	for (i = 0; i < upd_get_n_fields(update); i++) {
		if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {

			goto any_extern;
		}
	}

	page_cursor = btr_cur_get_page_cur(cursor);

	new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
					   &n_ext, heap);
	/* We checked above that there are no externally stored fields. */
	ut_a(!n_ext);

	/* The page containing the clustered index record
	corresponding to new_entry is latched in mtr.
	Thus the following call is safe. */
	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
						     FALSE, heap);
	old_rec_size = rec_offs_size(offsets);
	new_rec_size = rec_get_converted_size(index, new_entry, 0);

	page_zip = buf_block_get_page_zip(block);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

	if (UNIV_LIKELY_NULL(page_zip)
	    && !btr_cur_update_alloc_zip(page_zip, block, index,
					 new_rec_size, TRUE, mtr)) {
		err = DB_ZIP_OVERFLOW;
		goto err_exit;
	}

	if (UNIV_UNLIKELY(new_rec_size
			  >= (page_get_free_space_of_empty(page_is_comp(page))
			      / 2))) {

		/* The new record would consume over half an empty page;
		treat as overflow so the caller splits. */
		err = DB_OVERFLOW;
		goto err_exit;
	}

	if (UNIV_UNLIKELY(page_get_data_size(page)
			  - old_rec_size + new_rec_size
			  < BTR_CUR_PAGE_COMPRESS_LIMIT)) {

		/* The page would become too empty */

		err = DB_UNDERFLOW;
		goto err_exit;
	}

	max_size = old_rec_size
		+ page_get_max_insert_size_after_reorganize(page, 1);

	if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
	       && (max_size >= new_rec_size))
	      || (page_get_n_recs(page) <= 1))) {

		/* There was not enough space, or it did not pay to
		reorganize: for simplicity, we decide what to do assuming a
		reorganization is needed, though it might not be necessary */

		err = DB_OVERFLOW;
		goto err_exit;
	}

	/* Do lock checking and undo logging */
	err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
					thr, mtr, &roll_ptr);
	if (err != DB_SUCCESS) {

		goto err_exit;
	}

	/* Ok, we may do the replacement. Store on the page infimum the
	explicit locks on rec, before deleting rec (see the comment in
	btr_cur_pessimistic_update). */

	lock_rec_store_on_page_infimum(block, rec);

	btr_search_update_hash_on_delete(cursor);

	/* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above
	invokes rec_offs_make_valid() to point to the copied record that
	the fields of new_entry point to.  We have to undo it here. */
	ut_ad(rec_offs_validate(NULL, index, offsets));
	rec_offs_make_valid(page_cur_get_rec(page_cursor), index, offsets);

	page_cur_delete_rec(page_cursor, index, offsets, mtr);

	page_cur_move_to_prev(page_cursor);

	trx = thr_get_trx(thr);

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
					      roll_ptr);
		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
					      trx->id);
	}

	/* There are no externally stored columns in new_entry */
	rec = btr_cur_insert_if_possible(cursor, new_entry, 0/*n_ext*/, mtr);
	ut_a(rec); /* <- We calculated above the insert would fit */

	if (page_zip && !dict_index_is_clust(index)
	    && page_is_leaf(page)) {
		/* Update the free bits in the insert buffer. */
		ibuf_update_free_bits_zip(block, mtr);
	}

	/* Restore the old explicit lock state on the record */

	lock_rec_restore_from_page_infimum(block, rec, block);

	page_cur_move_to_next(page_cursor);

	err = DB_SUCCESS;
err_exit:
	mem_heap_free(heap);
	return(err);
}
02131 
/*************************************************************/
/** If, in a split, a new supremum record was created as the predecessor
of the updated record, the supremum record must inherit exactly the locks
on the updated record. In the split it may have inherited locks from the
successor of the updated record, which is not correct. This function
restores the right locks for the new supremum. */
static
void
btr_cur_pess_upd_restore_supremum(
/*==============================*/
	buf_block_t*	block,	/*!< in: buffer block of rec */
	const rec_t*	rec,	/*!< in: updated record */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_t*		page;
	buf_block_t*	prev_block;
	ulint		space;
	ulint		zip_size;
	ulint		prev_page_no;

	page = buf_block_get_frame(block);

	if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
		/* Updated record is not the first user record on its page;
		the supremum of the previous page cannot have inherited
		locks from it, so there is nothing to restore. */

		return;
	}

	space = buf_block_get_space(block);
	zip_size = buf_block_get_zip_size(block);
	prev_page_no = btr_page_get_prev(page, mtr);

	/* rec is the first user record on its page but was involved in a
	pessimistic update, so a previous page must exist. */
	ut_ad(prev_page_no != FIL_NULL);
	prev_block = buf_page_get_with_no_latch(space, zip_size,
						prev_page_no, mtr);
#ifdef UNIV_BTR_DEBUG
	ut_a(btr_page_get_next(prev_block->frame, mtr)
	     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

	/* We must already have an x-latch on prev_block! */
	ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));

	/* Make the supremum of the previous page inherit exactly the
	gap locks set on rec. */
	lock_rec_reset_and_inherit_gap_locks(prev_block, block,
					     PAGE_HEAP_NO_SUPREMUM,
					     page_rec_get_heap_no(rec));
}
02179 
/*************************************************************/
/** Performs an update of a record on a page of a tree. It is assumed
that mtr holds an x-latch on the tree and on the page. If the update is
made on the leaf level, to avoid deadlocks, mtr must also own x-latches
to brothers of page, if those brothers exist. The ordering fields of
the record are assumed not to change.
@return	DB_SUCCESS or error code */
UNIV_INTERN
ulint
btr_cur_pessimistic_update(
/*=======================*/
	ulint		flags,	/*!< in: undo logging, locking, and rollback
				flags */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to update */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, created
				here if *heap is NULL */
	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have
				to be stored externally by the caller, or
				NULL */
	const upd_t*	update,	/*!< in: update vector; trx id and roll ptr
				fields in it have no effect when
				BTR_KEEP_SYS_FLAG is not set */
	ulint		cmpl_info,/*!< in: compiler info on secondary index
				updates */
	que_thr_t*	thr,	/*!< in: query thread */
	mtr_t*		mtr)	/*!< in: mtr; must be committed before
				latching any further pages */
{
	big_rec_t*	big_rec_vec	= NULL;
	big_rec_t*	dummy_big_rec;
	dict_index_t*	index;
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;
	rec_t*		rec;
	page_cur_t*	page_cursor;
	dtuple_t*	new_entry;
	ulint		err;
	ulint		optim_err;
	roll_ptr_t	roll_ptr;
	trx_t*		trx;
	ibool		was_first;
	ulint		n_extents	= 0;
	ulint		n_reserved;
	ulint		n_ext;
	ulint*		offsets		= NULL;

	*big_rec = NULL;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	page_zip = buf_block_get_page_zip(block);
	rec = btr_cur_get_rec(cursor);
	index = cursor->index;

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
	/* The insert buffer tree should never be updated in place. */
	ut_ad(!dict_index_is_ibuf(index));

	/* Try the cheaper optimistic (in-place / same-page) update first;
	only the size-change outcomes fall through to the pessimistic
	path below. */
	optim_err = btr_cur_optimistic_update(flags, cursor, update,
					      cmpl_info, thr, mtr);

	switch (optim_err) {
	case DB_UNDERFLOW:
	case DB_OVERFLOW:
	case DB_ZIP_OVERFLOW:
		break;
	default:
		return(optim_err);
	}

	/* Do lock checking and undo logging */
	err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
					thr, mtr, &roll_ptr);
	if (err != DB_SUCCESS) {

		return(err);
	}

	if (optim_err == DB_OVERFLOW) {
		ulint	reserve_flag;

		/* First reserve enough free space for the file segments
		of the index tree, so that the update will not fail because
		of lack of space */

		n_extents = cursor->tree_height / 16 + 3;

		if (flags & BTR_NO_UNDO_LOG_FLAG) {
			/* No undo logging: we are in purge or rollback,
			which may use the space reserved for cleaning. */
			reserve_flag = FSP_CLEANING;
		} else {
			reserve_flag = FSP_NORMAL;
		}

		if (!fsp_reserve_free_extents(&n_reserved, index->space,
					      n_extents, reserve_flag, mtr)) {
			return(DB_OUT_OF_FILE_SPACE);
		}
	}

	if (!*heap) {
		*heap = mem_heap_create(1024);
	}
	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, heap);

	trx = thr_get_trx(thr);

	/* Build the new index entry by copying the old record and then
	applying the update vector to the copy. */
	new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
					   &n_ext, *heap);
	/* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above
	invokes rec_offs_make_valid() to point to the copied record that
	the fields of new_entry point to.  We have to undo it here. */
	ut_ad(rec_offs_validate(NULL, index, offsets));
	rec_offs_make_valid(rec, index, offsets);

	/* The page containing the clustered index record
	corresponding to new_entry is latched in mtr.  If the
	clustered index record is delete-marked, then its externally
	stored fields cannot have been purged yet, because then the
	purge would also have removed the clustered index record
	itself.  Thus the following call is safe. */
	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
						     FALSE, *heap);
	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		/* Stamp the roll pointer and the id of the updating
		transaction into the system fields of the new entry. */
		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
					      roll_ptr);
		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
					      trx->id);
	}

	if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(offsets)) {
		/* We are in a transaction rollback undoing a row
		update: we must free possible externally stored fields
		which got new values in the update, if they are not
		inherited values. They can be inherited if we have
		updated the primary key to another value, and then
		update it back again. */

		ut_ad(big_rec_vec == NULL);

		btr_rec_free_updated_extern_fields(
			index, rec, page_zip, offsets, update,
			trx_is_recv(trx) ? RB_RECOVERY : RB_NORMAL, mtr);
	}

	/* We have to set appropriate extern storage bits in the new
	record to be inserted: we have to remember which fields were such */

	ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, heap);
	n_ext += btr_push_update_extern_fields(new_entry, update, *heap);

	if (UNIV_LIKELY_NULL(page_zip)) {
		ut_ad(page_is_comp(page));
		if (page_zip_rec_needs_ext(
			    rec_get_converted_size(index, new_entry, n_ext),
			    TRUE,
			    dict_index_get_n_fields(index),
			    page_zip_get_size(page_zip))) {

			goto make_external;
		}
	} else if (page_zip_rec_needs_ext(
			   rec_get_converted_size(index, new_entry, n_ext),
			   page_is_comp(page), 0, 0)) {
make_external:
		/* The new entry is too big to fit on a page: move the
		longest fields to external (BLOB) storage and report
		them to the caller through *big_rec. */
		big_rec_vec = dtuple_convert_big_rec(index, new_entry, &n_ext);
		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {

			err = DB_TOO_BIG_RECORD;
			goto return_after_reservations;
		}
	}

	/* Store state of explicit locks on rec on the page infimum record,
	before deleting rec. The page infimum acts as a dummy carrier of the
	locks, taking care also of lock releases, before we can move the locks
	back on the actual record. There is a special case: if we are
	inserting on the root page and the insert causes a call of
	btr_root_raise_and_insert. Therefore we cannot in the lock system
	delete the lock structs set on the root page even if the root
	page carries just node pointers. */

	lock_rec_store_on_page_infimum(block, rec);

	btr_search_update_hash_on_delete(cursor);

#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
	page_cursor = btr_cur_get_page_cur(cursor);

	page_cur_delete_rec(page_cursor, index, offsets, mtr);

	page_cur_move_to_prev(page_cursor);

	/* First try to reinsert the updated entry on the same page. */
	rec = btr_cur_insert_if_possible(cursor, new_entry, n_ext, mtr);

	if (rec) {
		lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
						   rec, block);

		offsets = rec_get_offsets(rec, index, offsets,
					  ULINT_UNDEFINED, heap);

		if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
			/* The new inserted record owns its possible externally
			stored fields */
			btr_cur_unmark_extern_fields(page_zip,
						     rec, index, offsets, mtr);
		}

		btr_cur_compress_if_useful(cursor, mtr);

		if (page_zip && !dict_index_is_clust(index)
		    && page_is_leaf(page)) {
			/* Update the free bits in the insert buffer. */
			ibuf_update_free_bits_zip(block, mtr);
		}

		err = DB_SUCCESS;
		goto return_after_reservations;
	} else {
		/* The same-page insert can only fail for the grow cases. */
		ut_a(optim_err != DB_UNDERFLOW);

		/* Out of space: reset the free bits. */
		if (!dict_index_is_clust(index)
		    && page_is_leaf(page)) {
			ibuf_reset_free_bits(block);
		}
	}

	/* Was the record to be updated positioned as the first user
	record on its page? */
	was_first = page_cur_is_before_first(page_cursor);

	/* The first parameter means that no lock checking and undo logging
	is made in the insert */

	err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
					 | BTR_NO_LOCKING_FLAG
					 | BTR_KEEP_SYS_FLAG,
					 cursor, new_entry, &rec,
					 &dummy_big_rec, n_ext, NULL, mtr);
	ut_a(rec);
	ut_a(err == DB_SUCCESS);
	ut_a(dummy_big_rec == NULL);

	if (dict_index_is_sec_or_ibuf(index)) {
		/* Update PAGE_MAX_TRX_ID in the index page header.
		It was not updated by btr_cur_pessimistic_insert()
		because of BTR_NO_LOCKING_FLAG. */
		buf_block_t*	rec_block;

		rec_block = btr_cur_get_block(cursor);

		page_update_max_trx_id(rec_block,
				       buf_block_get_page_zip(rec_block),
				       trx->id, mtr);
	}

	if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
		/* The new inserted record owns its possible externally
		stored fields */
		buf_block_t*	rec_block = btr_cur_get_block(cursor);

#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page));
		page = buf_block_get_frame(rec_block);
#endif /* UNIV_ZIP_DEBUG */
		/* The insert may have gone to a different block. */
		page_zip = buf_block_get_page_zip(rec_block);

		offsets = rec_get_offsets(rec, index, offsets,
					  ULINT_UNDEFINED, heap);
		btr_cur_unmark_extern_fields(page_zip,
					     rec, index, offsets, mtr);
	}

	lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
					   rec, block);

	/* If necessary, restore also the correct lock state for a new,
	preceding supremum record created in a page split. While the old
	record was nonexistent, the supremum might have inherited its locks
	from a wrong record. */

	if (!was_first) {
		btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
						  rec, mtr);
	}

return_after_reservations:
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

	if (n_extents > 0) {
		fil_space_release_free_extents(index->space, n_reserved);
	}

	*big_rec = big_rec_vec;

	return(err);
}
02487 
02488 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
02489 
/****************************************************************/
/** Writes the redo log record for delete marking or unmarking of an index
record. */
UNIV_INLINE
void
btr_cur_del_mark_set_clust_rec_log(
/*===============================*/
	ulint		flags,	/*!< in: flags */
	rec_t*		rec,	/*!< in: record */
	dict_index_t*	index,	/*!< in: index of the record */
	ibool		val,	/*!< in: value to set */
	trx_t*		trx,	/*!< in: deleting transaction */
	roll_ptr_t	roll_ptr,/*!< in: roll ptr to the undo log record */
	mtr_t*		mtr)	/*!< in: mtr */
{
	byte*	log_ptr;
	/* flags and val must each fit in the single byte written below. */
	ut_ad(flags < 256);
	ut_ad(val <= 1);

	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));

	/* Reserve space for: 1 byte flags + 1 byte val + the system
	fields written by row_upd_write_sys_vals_to_log() (roll ptr plus
	a compressed field position and trx id, here budgeted as
	DATA_ROLL_PTR_LEN + 14) + 2 bytes page offset of rec. */
	log_ptr = mlog_open_and_write_index(mtr, rec, index,
					    page_rec_is_comp(rec)
					    ? MLOG_COMP_REC_CLUST_DELETE_MARK
					    : MLOG_REC_CLUST_DELETE_MARK,
					    1 + 1 + DATA_ROLL_PTR_LEN
					    + 14 + 2);

	if (!log_ptr) {
		/* Logging in mtr is switched off during crash recovery */
		return;
	}

	/* The byte layout below must stay in sync with
	btr_cur_parse_del_mark_set_clust_rec(). */
	mach_write_to_1(log_ptr, flags);
	log_ptr++;
	mach_write_to_1(log_ptr, val);
	log_ptr++;

	log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
						mtr);
	mach_write_to_2(log_ptr, page_offset(rec));
	log_ptr += 2;

	mlog_close(mtr, log_ptr);
}
02535 #endif /* !UNIV_HOTBACKUP */
02536 
02537 /****************************************************************/
02541 UNIV_INTERN
02542 byte*
02543 btr_cur_parse_del_mark_set_clust_rec(
02544 /*=================================*/
02545   byte*   ptr,  
02546   byte*   end_ptr,
02547   page_t*   page, 
02548   page_zip_des_t* page_zip,
02549   dict_index_t* index)  
02550 {
02551   ulint   flags;
02552   ulint   val;
02553   ulint   pos;
02554   trx_id_t  trx_id;
02555   roll_ptr_t  roll_ptr;
02556   ulint   offset;
02557   rec_t*    rec;
02558 
02559   ut_ad(!page
02560         || !!page_is_comp(page) == dict_table_is_comp(index->table));
02561 
02562   if (end_ptr < ptr + 2) {
02563 
02564     return(NULL);
02565   }
02566 
02567   flags = mach_read_from_1(ptr);
02568   ptr++;
02569   val = mach_read_from_1(ptr);
02570   ptr++;
02571 
02572   ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
02573 
02574   if (ptr == NULL) {
02575 
02576     return(NULL);
02577   }
02578 
02579   if (end_ptr < ptr + 2) {
02580 
02581     return(NULL);
02582   }
02583 
02584   offset = mach_read_from_2(ptr);
02585   ptr += 2;
02586 
02587   ut_a(offset <= UNIV_PAGE_SIZE);
02588 
02589   if (page) {
02590     rec = page + offset;
02591 
02592     /* We do not need to reserve btr_search_latch, as the page
02593     is only being recovered, and there cannot be a hash index to
02594     it. */
02595 
02596     btr_rec_set_deleted_flag(rec, page_zip, val);
02597 
02598     if (!(flags & BTR_KEEP_SYS_FLAG)) {
02599       mem_heap_t* heap    = NULL;
02600       ulint   offsets_[REC_OFFS_NORMAL_SIZE];
02601       rec_offs_init(offsets_);
02602 
02603       row_upd_rec_sys_fields_in_recovery(
02604         rec, page_zip,
02605         rec_get_offsets(rec, index, offsets_,
02606             ULINT_UNDEFINED, &heap),
02607         pos, trx_id, roll_ptr);
02608       if (UNIV_LIKELY_NULL(heap)) {
02609         mem_heap_free(heap);
02610       }
02611     }
02612   }
02613 
02614   return(ptr);
02615 }
02616 
02617 #ifndef UNIV_HOTBACKUP
/***********************************************************/
/** Marks a clustered index record deleted. Writes an undo log record to
undo log on this delete marking. Writes in the trx id field the id
of the deleting transaction, and in the roll ptr field a pointer to the
undo log record created.
@return	DB_SUCCESS, DB_LOCK_WAIT, or error number */
UNIV_INTERN
ulint
btr_cur_del_mark_set_clust_rec(
/*===========================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete-mark */
	ibool		val,	/*!< in: value to set */
	que_thr_t*	thr,	/*!< in: query thread */
	mtr_t*		mtr)	/*!< in: mtr */
{
	dict_index_t*	index;
	buf_block_t*	block;
	roll_ptr_t	roll_ptr;
	ulint		err;
	rec_t*		rec;
	page_zip_des_t*	page_zip;
	trx_t*		trx;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	rec = btr_cur_get_rec(cursor);
	index = cursor->index;
	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

#ifdef UNIV_DEBUG
	if (btr_cur_print_record_ops && thr) {
		btr_cur_trx_report(thr_get_trx(thr), index, "del mark ");
		rec_print_new(stderr, rec, offsets);
	}
#endif /* UNIV_DEBUG */

	ut_ad(dict_index_is_clust(index));
	ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));

	/* Check locks before modifying anything; may return DB_LOCK_WAIT. */
	err = lock_clust_rec_modify_check_and_lock(flags,
						   btr_cur_get_block(cursor),
						   rec, index, offsets, thr);

	if (err != DB_SUCCESS) {

		goto func_exit;
	}

	/* Write the undo log record before changing the page, so that the
	change can be rolled back; roll_ptr will be stamped into the record. */
	err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
					    index, NULL, NULL, 0, rec,
					    &roll_ptr);
	if (err != DB_SUCCESS) {

		goto func_exit;
	}

	block = btr_cur_get_block(cursor);

	if (block->is_hashed) {
		/* The page has an adaptive hash index: protect it while
		the record header is modified. */
		rw_lock_x_lock(&btr_search_latch);
	}

	page_zip = buf_block_get_page_zip(block);

	btr_rec_set_deleted_flag(rec, page_zip, val);

	trx = thr_get_trx(thr);

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		/* Stamp the deleting transaction's id and the roll
		pointer into the record's system fields. */
		row_upd_rec_sys_fields(rec, page_zip,
				       index, offsets, trx, roll_ptr);
	}

	if (block->is_hashed) {
		rw_lock_x_unlock(&btr_search_latch);
	}

	/* Write the redo log record for the delete mark change. */
	btr_cur_del_mark_set_clust_rec_log(flags, rec, index, val, trx,
					   roll_ptr, mtr);

func_exit:
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(err);
}
02708 
/****************************************************************/
/** Writes the redo log record for a delete mark setting of a secondary
index record. */
UNIV_INLINE
void
btr_cur_del_mark_set_sec_rec_log(
/*=============================*/
	rec_t*		rec,	/*!< in: record */
	ibool		val,	/*!< in: value to set */
	mtr_t*		mtr)	/*!< in: mtr */
{
	byte*	log_ptr;
	/* val must fit in the single byte written below. */
	ut_ad(val <= 1);

	/* Reserve space for: initial log record (up to 11 bytes)
	+ 1 byte val + 2 bytes page offset of rec. */
	log_ptr = mlog_open(mtr, 11 + 1 + 2);

	if (!log_ptr) {
		/* Logging in mtr is switched off during crash recovery:
		in that case mlog_open returns NULL */
		return;
	}

	/* The byte layout below must stay in sync with
	btr_cur_parse_del_mark_set_sec_rec(). */
	log_ptr = mlog_write_initial_log_record_fast(
		rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
	mach_write_to_1(log_ptr, val);
	log_ptr++;

	mach_write_to_2(log_ptr, page_offset(rec));
	log_ptr += 2;

	mlog_close(mtr, log_ptr);
}
02741 #endif /* !UNIV_HOTBACKUP */
02742 
02743 /****************************************************************/
02747 UNIV_INTERN
02748 byte*
02749 btr_cur_parse_del_mark_set_sec_rec(
02750 /*===============================*/
02751   byte*   ptr,  
02752   byte*   end_ptr,
02753   page_t*   page, 
02754   page_zip_des_t* page_zip)
02755 {
02756   ulint val;
02757   ulint offset;
02758   rec_t*  rec;
02759 
02760   if (end_ptr < ptr + 3) {
02761 
02762     return(NULL);
02763   }
02764 
02765   val = mach_read_from_1(ptr);
02766   ptr++;
02767 
02768   offset = mach_read_from_2(ptr);
02769   ptr += 2;
02770 
02771   ut_a(offset <= UNIV_PAGE_SIZE);
02772 
02773   if (page) {
02774     rec = page + offset;
02775 
02776     /* We do not need to reserve btr_search_latch, as the page
02777     is only being recovered, and there cannot be a hash index to
02778     it. */
02779 
02780     btr_rec_set_deleted_flag(rec, page_zip, val);
02781   }
02782 
02783   return(ptr);
02784 }
02785 
02786 #ifndef UNIV_HOTBACKUP
/***********************************************************/
/** Sets a secondary index record delete mark to TRUE or FALSE.
@return	DB_SUCCESS, DB_LOCK_WAIT, or error number */
UNIV_INTERN
ulint
btr_cur_del_mark_set_sec_rec(
/*=========================*/
	ulint		flags,	/*!< in: locking flag */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete-mark */
	ibool		val,	/*!< in: value to set */
	que_thr_t*	thr,	/*!< in: query thread */
	mtr_t*		mtr)	/*!< in: mtr */
{
	buf_block_t*	block;
	rec_t*		rec;
	ulint		err;

	block = btr_cur_get_block(cursor);
	rec = btr_cur_get_rec(cursor);

#ifdef UNIV_DEBUG
	if (btr_cur_print_record_ops && thr) {
		btr_cur_trx_report(thr_get_trx(thr), cursor->index,
				   "del mark ");
		rec_print(stderr, rec, cursor->index);
	}
#endif /* UNIV_DEBUG */

	/* Check locks before modifying the record; may return
	DB_LOCK_WAIT.  No undo logging is done for secondary indexes. */
	err = lock_sec_rec_modify_check_and_lock(flags,
						 btr_cur_get_block(cursor),
						 rec, cursor->index, thr, mtr);
	if (err != DB_SUCCESS) {

		return(err);
	}

	ut_ad(!!page_rec_is_comp(rec)
	      == dict_table_is_comp(cursor->index->table));

	if (block->is_hashed) {
		/* The page has an adaptive hash index: protect it while
		the record header is modified. */
		rw_lock_x_lock(&btr_search_latch);
	}

	btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);

	if (block->is_hashed) {
		rw_lock_x_unlock(&btr_search_latch);
	}

	/* Write the redo log record for the delete mark change. */
	btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);

	return(DB_SUCCESS);
}
02840 
/***********************************************************/
/** Sets the delete-mark flag of a record to val and writes the
corresponding redo log record, without any locking or undo logging.
NOTE(review): judging by the name, this is used by the insert buffer
merge on freshly read pages — confirm against callers. */
UNIV_INTERN
void
btr_cur_set_deleted_flag_for_ibuf(
/*==============================*/
	rec_t*		rec,		/*!< in/out: record */
	page_zip_des_t*	page_zip,	/*!< in/out: compressed page
					corresponding to rec, or NULL
					when the page is not compressed */
	ibool		val,		/*!< in: value to set */
	mtr_t*		mtr)		/*!< in: mtr */
{
	/* We do not need to reserve btr_search_latch, as the page has just
	been read to the buffer pool and there cannot be a hash index to it. */

	btr_rec_set_deleted_flag(rec, page_zip, val);

	btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
}
02863 
02864 /*==================== B-TREE RECORD REMOVE =========================*/
02865 
02866 /*************************************************************/
02873 UNIV_INTERN
02874 ibool
02875 btr_cur_compress_if_useful(
02876 /*=======================*/
02877   btr_cur_t*  cursor, 
02880   mtr_t*    mtr)  
02881 {
02882   ut_ad(mtr_memo_contains(mtr,
02883         dict_index_get_lock(btr_cur_get_index(cursor)),
02884         MTR_MEMO_X_LOCK));
02885   ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
02886         MTR_MEMO_PAGE_X_FIX));
02887 
02888   return(btr_cur_compress_recommendation(cursor, mtr)
02889          && btr_compress(cursor, mtr));
02890 }
02891 
/*******************************************************/
/** Removes the record on which the tree cursor is positioned on a leaf
page, if the removal does not make the page too empty (no page merge is
attempted).  It is assumed that mtr holds an x-latch on the page where
the cursor is positioned, but no latch on the whole tree.
@return	TRUE if success, i.e., the page did not become too empty */
UNIV_INTERN
ibool
btr_cur_optimistic_delete(
/*======================*/
	btr_cur_t*	cursor,	/*!< in: cursor on leaf page, on the record
				to delete; cursor stays valid: if deletion
				succeeds, on function exit it points to the
				successor of the deleted record */
	mtr_t*		mtr)	/*!< in: mtr */
{
	buf_block_t*	block;
	rec_t*		rec;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	ibool		no_compress_needed;
	rec_offs_init(offsets_);

	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
				MTR_MEMO_PAGE_X_FIX));
	/* This is intended only for leaf page deletions */

	block = btr_cur_get_block(cursor);

	ut_ad(page_is_leaf(buf_block_get_frame(block)));

	rec = btr_cur_get_rec(cursor);
	offsets = rec_get_offsets(rec, cursor->index, offsets,
				  ULINT_UNDEFINED, &heap);

	/* The optimistic path applies only when the record has no
	externally stored fields and the page stays full enough that no
	compression (merge) would be needed afterwards. */
	no_compress_needed = !rec_offs_any_extern(offsets)
		&& btr_cur_can_delete_without_compress(
			cursor, rec_offs_size(offsets), mtr);

	if (no_compress_needed) {

		page_t*		page	= buf_block_get_frame(block);
		page_zip_des_t*	page_zip= buf_block_get_page_zip(block);
		ulint		max_ins	= 0;

		lock_update_delete(block, rec);

		btr_search_update_hash_on_delete(cursor);

		if (!page_zip) {
			/* Remember the reorganized max insert size
			before the delete, for updating the insert
			buffer free bits below. */
			max_ins = page_get_max_insert_size_after_reorganize(
				page, 1);
		}
#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
		page_cur_delete_rec(btr_cur_get_page_cur(cursor),
				    cursor->index, offsets, mtr);
#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

		if (dict_index_is_clust(cursor->index)
		    || dict_index_is_ibuf(cursor->index)
		    || !page_is_leaf(page)) {
			/* The insert buffer does not handle
			inserts to clustered indexes, to
			non-leaf pages of secondary index B-trees,
			or to the insert buffer. */
		} else if (page_zip) {
			ibuf_update_free_bits_zip(block, mtr);
		} else {
			ibuf_update_free_bits_low(block, max_ins, mtr);
		}
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	return(no_compress_needed);
}
02977 
/*************************************************************/
/** Removes the record on which the tree cursor is positioned.  Tries to
compress the page if its fillfactor drops below a threshold or if it is
the only page on the level.  It is assumed that mtr holds an x-latch on
the tree and on the page.  If the cursor is on the leaf level, to avoid
deadlocks, mtr must also own x-latches to brothers of page, if those
brothers exist.
@return	TRUE if compression occurred */
UNIV_INTERN
ibool
btr_cur_pessimistic_delete(
/*=======================*/
	ulint*		err,	/*!< out: DB_SUCCESS or
				DB_OUT_OF_FILE_SPACE */
	ibool		has_reserved_extents, /*!< in: TRUE if the caller
				has already reserved enough free extents
				so that the node pointer updates cannot
				fail because of lack of space */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
				if compression does not occur, the cursor
				stays valid: it points to the successor of
				the deleted record on function exit */
	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context, passed on when
				freeing externally stored fields */
	mtr_t*		mtr)	/*!< in: mtr */
{
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;
	dict_index_t*	index;
	rec_t*		rec;
	dtuple_t*	node_ptr;
	ulint		n_extents	= 0;
	ulint		n_reserved;
	ibool		success;
	ibool		ret		= FALSE;
	ulint		level;
	mem_heap_t*	heap;
	ulint*		offsets;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	index = btr_cur_get_index(cursor);

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	if (!has_reserved_extents) {
		/* First reserve enough free space for the file segments
		of the index tree, so that the node pointer updates will
		not fail because of lack of space */

		n_extents = cursor->tree_height / 32 + 1;

		success = fsp_reserve_free_extents(&n_reserved,
						   index->space,
						   n_extents,
						   FSP_CLEANING, mtr);
		if (!success) {
			*err = DB_OUT_OF_FILE_SPACE;

			return(FALSE);
		}
	}

	heap = mem_heap_create(1024);
	rec = btr_cur_get_rec(cursor);
	page_zip = buf_block_get_page_zip(block);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);

	if (rec_offs_any_extern(offsets)) {
		/* Free the externally stored (BLOB) fields owned by the
		record before removing the record itself. */
		btr_rec_free_externally_stored_fields(index,
						      rec, offsets, page_zip,
						      rb_ctx, mtr);
#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
	}

	if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
	    && UNIV_UNLIKELY(dict_index_get_page(index)
			     != buf_block_get_page_no(block))) {

		/* If there is only one record, drop the whole page in
		btr_discard_page, if this is not the root page */

		btr_discard_page(cursor, mtr);

		*err = DB_SUCCESS;
		ret = TRUE;

		goto return_after_reservations;
	}

	lock_update_delete(block, rec);
	level = btr_page_get_level(page, mtr);

	if (level > 0
	    && UNIV_UNLIKELY(rec == page_rec_get_next(
				     page_get_infimum_rec(page)))) {

		/* We are deleting the leftmost record on a non-leaf page. */
		rec_t*	next_rec = page_rec_get_next(rec);

		if (btr_page_get_prev(page, mtr) == FIL_NULL) {

			/* If we delete the leftmost node pointer on a
			non-leaf level, we must mark the new leftmost node
			pointer as the predefined minimum record */

			/* This will make page_zip_validate() fail until
			page_cur_delete_rec() completes.  This is harmless,
			because everything will take place within a single
			mini-transaction and because writing to the redo log
			is an atomic operation (performed by mtr_commit()). */
			btr_set_min_rec_mark(next_rec, mtr);
		} else {
			/* Otherwise, if we delete the leftmost node pointer
			on a page, we have to change the father node pointer
			so that it is equal to the new leftmost node pointer
			on the page */

			btr_node_ptr_delete(index, block, mtr);

			node_ptr = dict_index_build_node_ptr(
				index, next_rec, buf_block_get_page_no(block),
				heap, level);

			btr_insert_on_non_leaf_level(index,
						     level + 1, node_ptr, mtr);
		}
	}

	btr_search_update_hash_on_delete(cursor);

	page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

	ut_ad(btr_check_node_ptr(index, block, mtr));

	*err = DB_SUCCESS;

return_after_reservations:
	mem_heap_free(heap);

	if (ret == FALSE) {
		/* The page was not discarded: try to compress it. */
		ret = btr_cur_compress_if_useful(cursor, mtr);
	}

	if (n_extents > 0) {
		fil_space_release_free_extents(index->space, n_reserved);
	}

	return(ret);
}
03140 
03141 /*******************************************************************/
03144 static
03145 void
03146 btr_cur_add_path_info(
03147 /*==================*/
03148   btr_cur_t*  cursor,   
03149   ulint   height,   
03151   ulint   root_height)  
03152 {
03153   btr_path_t* slot;
03154   rec_t*    rec;
03155   page_t*   page;
03156 
03157   ut_a(cursor->path_arr);
03158 
03159   if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
03160     /* Do nothing; return empty path */
03161 
03162     slot = cursor->path_arr;
03163     slot->nth_rec = ULINT_UNDEFINED;
03164 
03165     return;
03166   }
03167 
03168   if (height == 0) {
03169     /* Mark end of slots for path */
03170     slot = cursor->path_arr + root_height + 1;
03171     slot->nth_rec = ULINT_UNDEFINED;
03172   }
03173 
03174   rec = btr_cur_get_rec(cursor);
03175 
03176   slot = cursor->path_arr + (root_height - height);
03177 
03178   page = page_align(rec);
03179 
03180   slot->nth_rec = page_rec_get_n_recs_before(rec);
03181   slot->n_recs = page_get_n_recs(page);
03182   slot->page_no = page_get_page_no(page);
03183   slot->page_level = btr_page_get_level_low(page);
03184 }
03185 
/*******************************************************************/
/** Estimates the number of rows in a given index range by counting
records on a single B-tree level, between the pages recorded in two
btr_path_t slots (the left and right borders of the range). At most
N_PAGES_READ_LIMIT pages are read; if the scan is cut short, the
count is extrapolated from the average records-per-page seen so far.
@return	number of rows in the range on this level; the value is exact
only if *is_n_rows_exact is set to TRUE */
static
ib_int64_t
btr_estimate_n_rows_in_range_on_level(
/*==================================*/
  dict_index_t* index,      /*!< in: index */
  btr_path_t* slot1,      /*!< in: path slot of the left border page */
  btr_path_t* slot2,      /*!< in: path slot of the right border page */
  ib_int64_t  n_rows_on_prev_level, /*!< in: number of records on the
          level above; equals the number of pages on this level and
          is used for extrapolation when the scan is interrupted */
  ibool*    is_n_rows_exact)  /*!< out: TRUE if the returned value
          is an exact count, FALSE if it is an estimate */
{
  ulint   space;
  ib_int64_t  n_rows;
  ulint   n_pages_read;
  ulint   page_no;
  ulint   zip_size;
  ulint   level;

  space = dict_index_get_space(index);

  n_rows = 0;
  n_pages_read = 0;

  /* Assume by default that we will scan all pages between
  slot1->page_no and slot2->page_no */
  *is_n_rows_exact = TRUE;

  /* add records from slot1->page_no which are to the right of
  the record which serves as a left border of the range, if any */
  if (slot1->nth_rec < slot1->n_recs) {
    n_rows += slot1->n_recs - slot1->nth_rec;
  }

  /* add records from slot2->page_no which are to the left of
  the record which serves as a right border of the range, if any */
  if (slot2->nth_rec > 1) {
    n_rows += slot2->nth_rec - 1;
  }

  /* count the records in the pages between slot1->page_no and
  slot2->page_no (non inclusive), if any */

  zip_size = fil_space_get_zip_size(space);

  /* Do not read more than this number of pages in order not to hurt
  performance with this code which is just an estimation. If we read
  this many pages before reaching slot2->page_no then we estimate the
  average from the pages scanned so far */
# define N_PAGES_READ_LIMIT 10

  page_no = slot1->page_no;
  level = slot1->page_level;

  do {
    mtr_t   mtr;
    page_t*   page;
    buf_block_t*  block;

    mtr_start(&mtr);

    /* fetch the page with an S-latch; each page is latched
    within its own mini-transaction so that we never hold
    more than one page latch at a time here */
    block = buf_page_get(space, zip_size, page_no, RW_S_LATCH,
             &mtr);

    page = buf_block_get_frame(block);

    /* It is possible that the tree has been reorganized in the
    meantime and this is a different page. If this happens the
    calculated estimate will be bogus, which is not fatal as
    this is only an estimate. We are sure that a page with
    page_no exists because InnoDB never frees pages, only
    reuses them. */
    if (fil_page_get_type(page) != FIL_PAGE_INDEX
        || btr_page_get_index_id(page) != index->id
        || btr_page_get_level_low(page) != level) {

      /* The page got reused for something else */
      mtr_commit(&mtr);
      goto inexact;
    }

    n_pages_read++;

    if (page_no != slot1->page_no) {
      /* Do not count the records on slot1->page_no,
      we already counted them before this loop. */
      n_rows += page_get_n_recs(page);
    }

    page_no = btr_page_get_next(page, &mtr);

    mtr_commit(&mtr);

    if (n_pages_read == N_PAGES_READ_LIMIT
        || page_no == FIL_NULL) {
      /* Either we read too many pages or
      we reached the end of the level without passing
      through slot2->page_no, the tree must have changed
      in the meantime */
      goto inexact;
    }

  } while (page_no != slot2->page_no);

  return(n_rows);

inexact:

  *is_n_rows_exact = FALSE;

  /* We did interrupt before reaching slot2->page */

  if (n_pages_read > 0) {
    /* The number of pages on this level is
    n_rows_on_prev_level, multiply it by the
    average number of recs per page so far */
    n_rows = n_rows_on_prev_level
      * n_rows / n_pages_read;
  } else {
    /* The tree changed before we could even
    start with slot1->page_no */
    n_rows = 10;
  }

  return(n_rows);
}
03329 
/*******************************************************************/
/** Estimates the number of rows in a given index range. Positions a
cursor at each end of the range with the BTR_ESTIMATE flag so that
the search paths from the root are recorded in path1/path2, then
compares the two paths slot by slot (level by level) to compute the
estimate.
@return	estimated number of rows */
UNIV_INTERN
ib_int64_t
btr_estimate_n_rows_in_range(
/*=========================*/
  dict_index_t* index,  /*!< in: index */
  const dtuple_t* tuple1, /*!< in: range start; if it has zero fields,
        the cursor is opened at the leftmost leaf */
  ulint   mode1,  /*!< in: search mode for range start */
  const dtuple_t* tuple2, /*!< in: range end; if it has zero fields,
        the cursor is opened at the rightmost leaf */
  ulint   mode2)  /*!< in: search mode for range end */
{
  btr_path_t  path1[BTR_PATH_ARRAY_N_SLOTS];
  btr_path_t  path2[BTR_PATH_ARRAY_N_SLOTS];
  btr_cur_t cursor;
  btr_path_t* slot1;
  btr_path_t* slot2;
  ibool   diverged;
  ibool   diverged_lot;
  ulint   divergence_level;
  ib_int64_t  n_rows;
  ibool   is_n_rows_exact;
  ulint   i;
  mtr_t   mtr;

  mtr_start(&mtr);

  cursor.path_arr = path1;

  if (dtuple_get_n_fields(tuple1) > 0) {

    btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
              BTR_SEARCH_LEAF | BTR_ESTIMATE,
              &cursor, 0,
              __FILE__, __LINE__, &mtr);
  } else {
    btr_cur_open_at_index_side(TRUE, index,
             BTR_SEARCH_LEAF | BTR_ESTIMATE,
             &cursor, &mtr);
  }

  mtr_commit(&mtr);

  mtr_start(&mtr);

  cursor.path_arr = path2;

  if (dtuple_get_n_fields(tuple2) > 0) {

    btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
              BTR_SEARCH_LEAF | BTR_ESTIMATE,
              &cursor, 0,
              __FILE__, __LINE__, &mtr);
  } else {
    btr_cur_open_at_index_side(FALSE, index,
             BTR_SEARCH_LEAF | BTR_ESTIMATE,
             &cursor, &mtr);
  }

  mtr_commit(&mtr);

  /* We have the path information for the range in path1 and path2 */

  n_rows = 1;
  is_n_rows_exact = TRUE;
  diverged = FALSE;     /* This becomes true when the path is not
            the same any more */
  diverged_lot = FALSE;     /* This becomes true when the paths are
            not the same or adjacent any more */
  divergence_level = 1000000; /* This is the level where paths diverged
            a lot */
  for (i = 0; ; i++) {
    ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);

    slot1 = path1 + i;
    slot2 = path2 + i;

    /* An ULINT_UNDEFINED sentinel marks the end of a recorded
    path: both paths have been consumed and the estimate is
    final (after adjustment below). */
    if (slot1->nth_rec == ULINT_UNDEFINED
        || slot2->nth_rec == ULINT_UNDEFINED) {

      if (i > divergence_level + 1 && !is_n_rows_exact) {
        /* In trees whose height is > 1 our algorithm
        tends to underestimate: multiply the estimate
        by 2: */

        n_rows = n_rows * 2;
      }

      /* Do not estimate the number of rows in the range
      to over 1 / 2 of the estimated rows in the whole
      table */

      if (n_rows > index->table->stat_n_rows / 2
          && !is_n_rows_exact) {

        n_rows = index->table->stat_n_rows / 2;

        /* If there are just 0 or 1 rows in the table,
        then we estimate all rows are in the range */

        if (n_rows == 0) {
          n_rows = index->table->stat_n_rows;
        }
      }

      return(n_rows);
    }

    if (!diverged && slot1->nth_rec != slot2->nth_rec) {

      diverged = TRUE;

      if (slot1->nth_rec < slot2->nth_rec) {
        n_rows = slot2->nth_rec - slot1->nth_rec;

        if (n_rows > 1) {
          diverged_lot = TRUE;
          divergence_level = i;
        }
      } else {
        /* It is possible that
        slot1->nth_rec >= slot2->nth_rec
        if, for example, we have a single page
        tree which contains (inf, 5, 6, supr)
        and we select where x > 20 and x < 30;
        in this case slot1->nth_rec will point
        to the supr record and slot2->nth_rec
        will point to 6 */
        n_rows = 0;
      }

    } else if (diverged && !diverged_lot) {

      /* The paths diverged to adjacent records on the
      previous level; they diverge a lot as soon as there
      are records between the two borders on this level. */
      if (slot1->nth_rec < slot1->n_recs
          || slot2->nth_rec > 1) {

        diverged_lot = TRUE;
        divergence_level = i;

        n_rows = 0;

        if (slot1->nth_rec < slot1->n_recs) {
          n_rows += slot1->n_recs
            - slot1->nth_rec;
        }

        if (slot2->nth_rec > 1) {
          n_rows += slot2->nth_rec - 1;
        }
      }
    } else if (diverged_lot) {

      /* Refine the estimate level by level: n_rows on the
      previous level is the page count on this level. */
      n_rows = btr_estimate_n_rows_in_range_on_level(
        index, slot1, slot2, n_rows,
        &is_n_rows_exact);
    }
  }
}
03489 
/*******************************************************************/
/** Estimates the number of different key values in the index, for
each column-prefix of the key, by sampling random leaf pages. The
results are written to index->stat_n_diff_key_vals[]. */
UNIV_INTERN
void
btr_estimate_number_of_different_key_vals(
/*======================================*/
  dict_index_t* index)  /*!< in/out: index whose statistics
        (stat_n_diff_key_vals) are updated */
{
  btr_cur_t cursor;
  page_t*   page;
  rec_t*    rec;
  ulint   n_cols;     /* number of columns in a unique key prefix */
  ulint   matched_fields;
  ulint   matched_bytes;
  ib_int64_t* n_diff;     /* n_diff[j]: borders seen between
          different j-column prefixes */
  ullint    n_sample_pages; /* number of pages to sample */
  ulint   not_empty_flag  = 0;
  ulint   total_external_size = 0;
  ulint   i;
  ulint   j;
  ullint    add_on;
  mtr_t   mtr;
  mem_heap_t* heap    = NULL;
  ulint   offsets_rec_[REC_OFFS_NORMAL_SIZE];
  ulint   offsets_next_rec_[REC_OFFS_NORMAL_SIZE];
  ulint*    offsets_rec = offsets_rec_;
  ulint*    offsets_next_rec= offsets_next_rec_;
  rec_offs_init(offsets_rec_);
  rec_offs_init(offsets_next_rec_);

  n_cols = dict_index_get_n_unique(index);

  n_diff = (ib_int64_t *)mem_zalloc((n_cols + 1) * sizeof(ib_int64_t));

  /* It makes no sense to test more pages than are contained
  in the index, thus we lower the number if it is too high */
  if (srv_stats_sample_pages > index->stat_index_size) {
    if (index->stat_index_size > 0) {
      n_sample_pages = index->stat_index_size;
    } else {
      n_sample_pages = 1;
    }
  } else {
    n_sample_pages = srv_stats_sample_pages;
  }

  /* We sample some pages in the index to get an estimate */

  for (i = 0; i < n_sample_pages; i++) {
    rec_t*  supremum;
    mtr_start(&mtr);

    btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr);

    /* Count the number of different key values for each prefix of
    the key on this index page. If the prefix does not determine
    the index record uniquely in the B-tree, then we subtract one
    because otherwise our algorithm would give a wrong estimate
    for an index where there is just one key value. */

    page = btr_cur_get_page(&cursor);

    supremum = page_get_supremum_rec(page);
    rec = page_rec_get_next(page_get_infimum_rec(page));

    if (rec != supremum) {
      not_empty_flag = 1;
      offsets_rec = rec_get_offsets(rec, index, offsets_rec,
                  ULINT_UNDEFINED, &heap);
    }

    /* Walk consecutive record pairs on the page, counting
    in n_diff[j] every border where the j-column prefix
    changes. */
    while (rec != supremum) {
      rec_t*  next_rec = page_rec_get_next(rec);
      if (next_rec == supremum) {
        break;
      }

      matched_fields = 0;
      matched_bytes = 0;
      offsets_next_rec = rec_get_offsets(next_rec, index,
                 offsets_next_rec,
                 n_cols, &heap);

      cmp_rec_rec_with_match(rec, next_rec,
                 offsets_rec, offsets_next_rec,
                 index, &matched_fields,
                 &matched_bytes);

      for (j = matched_fields + 1; j <= n_cols; j++) {
        /* We add one if this index record has
        a different prefix from the previous */

        n_diff[j]++;
      }

      total_external_size
        += btr_rec_get_externally_stored_len(
          rec, offsets_rec);

      rec = next_rec;
      /* Initialize offsets_rec for the next round
      and assign the old offsets_rec buffer to
      offsets_next_rec. */
      {
        ulint*  offsets_tmp = offsets_rec;
        offsets_rec = offsets_next_rec;
        offsets_next_rec = offsets_tmp;
      }
    }


    if (n_cols == dict_index_get_n_unique_in_tree(index)) {

      /* If there is more than one leaf page in the tree,
      we add one because we know that the first record
      on the page certainly had a different prefix than the
      last record on the previous index page in the
      alphabetical order. Before this fix, if there was
      just one big record on each clustered index page, the
      algorithm grossly underestimated the number of rows
      in the table. */

      if (btr_page_get_prev(page, &mtr) != FIL_NULL
          || btr_page_get_next(page, &mtr) != FIL_NULL) {

        n_diff[n_cols]++;
      }
    }

    /* Account also for the last record on the page (the
    loop above stopped before it). */
    offsets_rec = rec_get_offsets(rec, index, offsets_rec,
                ULINT_UNDEFINED, &heap);
    total_external_size += btr_rec_get_externally_stored_len(
      rec, offsets_rec);
    mtr_commit(&mtr);
  }

  /* If we saw k borders between different key values on
  n_sample_pages leaf pages, we can estimate how many
  there will be in index->stat_n_leaf_pages */

  /* We must take into account that our sample actually represents
  also the pages used for external storage of fields (those pages are
  included in index->stat_n_leaf_pages) */

  for (j = 0; j <= n_cols; j++) {
    index->stat_n_diff_key_vals[j]
      = ((n_diff[j]
          * (ib_int64_t)index->stat_n_leaf_pages
          + n_sample_pages - 1
          + total_external_size
          + not_empty_flag)
         / (n_sample_pages
            + total_external_size));

    /* If the tree is small, smaller than
    10 * n_sample_pages + total_external_size, then
    the above estimate is ok. For bigger trees it is common that we
    do not see any borders between key values in the few pages
    we pick. But still there may be n_sample_pages
    different key values, or even more. Let us try to approximate
    that: */

    add_on = index->stat_n_leaf_pages
      / (10 * (n_sample_pages
         + total_external_size));

    if (add_on > n_sample_pages) {
      add_on = n_sample_pages;
    }

    index->stat_n_diff_key_vals[j] += add_on;
  }

  mem_free(n_diff);
  if (UNIV_LIKELY_NULL(heap)) {
    mem_heap_free(heap);
  }
}
03670 
03671 /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
03672 
03673 /***********************************************************/
03676 static
03677 ulint
03678 btr_rec_get_externally_stored_len(
03679 /*==============================*/
03680   rec_t*    rec,  
03681   const ulint*  offsets)
03682 {
03683   ulint n_fields;
03684   byte* data;
03685   ulint local_len;
03686   ulint extern_len;
03687   ulint total_extern_len = 0;
03688   ulint i;
03689 
03690   ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
03691   n_fields = rec_offs_n_fields(offsets);
03692 
03693   for (i = 0; i < n_fields; i++) {
03694     if (rec_offs_nth_extern(offsets, i)) {
03695 
03696       data = rec_get_nth_field(rec, offsets, i, &local_len);
03697 
03698       local_len -= BTR_EXTERN_FIELD_REF_SIZE;
03699 
03700       extern_len = mach_read_from_4(data + local_len
03701                   + BTR_EXTERN_LEN + 4);
03702 
03703       total_extern_len += ut_calc_align(extern_len,
03704                 UNIV_PAGE_SIZE);
03705     }
03706   }
03707 
03708   return(total_extern_len / UNIV_PAGE_SIZE);
03709 }
03710 
03711 /*******************************************************************/
03713 static
03714 void
03715 btr_cur_set_ownership_of_extern_field(
03716 /*==================================*/
03717   page_zip_des_t* page_zip,
03719   rec_t*    rec,  
03720   dict_index_t* index,  
03721   const ulint*  offsets,
03722   ulint   i,  
03723   ibool   val,  
03724   mtr_t*    mtr)  
03725 {
03726   byte* data;
03727   ulint local_len;
03728   ulint byte_val;
03729 
03730   data = rec_get_nth_field(rec, offsets, i, &local_len);
03731 
03732   ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
03733 
03734   local_len -= BTR_EXTERN_FIELD_REF_SIZE;
03735 
03736   byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);
03737 
03738   if (val) {
03739     byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
03740   } else {
03741     byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
03742   }
03743 
03744   if (UNIV_LIKELY_NULL(page_zip)) {
03745     mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
03746     page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
03747   } else if (UNIV_LIKELY(mtr != NULL)) {
03748 
03749     mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
03750          MLOG_1BYTE, mtr);
03751   } else {
03752     mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
03753   }
03754 }
03755 
03756 /*******************************************************************/
03762 UNIV_INTERN
03763 ibool
03764 btr_cur_mark_extern_inherited_fields(
03765 /*=================================*/
03766   page_zip_des_t* page_zip,
03768   rec_t*    rec,  
03769   dict_index_t* index,  
03770   const ulint*  offsets,
03771   const upd_t*  update, 
03772   mtr_t*    mtr)  
03773 {
03774   ulint n;
03775   ulint j;
03776   ulint i;
03777   ibool change_ownership = FALSE;
03778 
03779   ut_ad(rec_offs_validate(rec, NULL, offsets));
03780   ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
03781 
03782   if (!rec_offs_any_extern(offsets)) {
03783 
03784     return(FALSE);
03785   }
03786 
03787   n = rec_offs_n_fields(offsets);
03788 
03789   for (i = 0; i < n; i++) {
03790     if (rec_offs_nth_extern(offsets, i)) {
03791 
03792       /* Check it is not in updated fields */
03793 
03794       if (update) {
03795         for (j = 0; j < upd_get_n_fields(update);
03796              j++) {
03797           if (upd_get_nth_field(update, j)
03798               ->field_no == i) {
03799 
03800             goto updated;
03801           }
03802         }
03803       }
03804 
03805       btr_cur_set_ownership_of_extern_field(
03806         page_zip, rec, index, offsets, i, FALSE, mtr);
03807 
03808       change_ownership = TRUE;
03809 updated:
03810       ;
03811     }
03812   }
03813 
03814   return(change_ownership);
03815 }
03816 
03817 /*******************************************************************/
03821 UNIV_INTERN
03822 void
03823 btr_cur_mark_dtuple_inherited_extern(
03824 /*=================================*/
03825   dtuple_t* entry,    
03827   const upd_t*  update)   
03828 {
03829   ulint   i;
03830 
03831   for (i = 0; i < dtuple_get_n_fields(entry); i++) {
03832 
03833     dfield_t* dfield = dtuple_get_nth_field(entry, i);
03834     byte*   data;
03835     ulint   len;
03836     ulint   j;
03837 
03838     if (!dfield_is_ext(dfield)) {
03839       continue;
03840     }
03841 
03842     /* Check if it is in updated fields */
03843 
03844     for (j = 0; j < upd_get_n_fields(update); j++) {
03845       if (upd_get_nth_field(update, j)->field_no == i) {
03846 
03847         goto is_updated;
03848       }
03849     }
03850 
03851     data = (unsigned char *)dfield_get_data(dfield);
03852     len = dfield_get_len(dfield);
03853     data[len - BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN]
03854       |= BTR_EXTERN_INHERITED_FLAG;
03855 
03856 is_updated:
03857     ;
03858   }
03859 }
03860 
03861 /*******************************************************************/
03865 static
03866 void
03867 btr_cur_unmark_extern_fields(
03868 /*=========================*/
03869   page_zip_des_t* page_zip,
03871   rec_t*    rec,  
03872   dict_index_t* index,  
03873   const ulint*  offsets,
03874   mtr_t*    mtr)  
03875 {
03876   ulint n;
03877   ulint i;
03878 
03879   ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
03880   n = rec_offs_n_fields(offsets);
03881 
03882   if (!rec_offs_any_extern(offsets)) {
03883 
03884     return;
03885   }
03886 
03887   for (i = 0; i < n; i++) {
03888     if (rec_offs_nth_extern(offsets, i)) {
03889 
03890       btr_cur_set_ownership_of_extern_field(
03891         page_zip, rec, index, offsets, i, TRUE, mtr);
03892     }
03893   }
03894 }
03895 
03896 /*******************************************************************/
03898 UNIV_INTERN
03899 void
03900 btr_cur_unmark_dtuple_extern_fields(
03901 /*================================*/
03902   dtuple_t* entry)    
03903 {
03904   ulint i;
03905 
03906   for (i = 0; i < dtuple_get_n_fields(entry); i++) {
03907     dfield_t* dfield = dtuple_get_nth_field(entry, i);
03908 
03909     if (dfield_is_ext(dfield)) {
03910       byte* data = (unsigned char *)dfield_get_data(dfield);
03911       ulint len = dfield_get_len(dfield);
03912 
03913       data[len - BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN]
03914         &= ~BTR_EXTERN_OWNER_FLAG;
03915     }
03916   }
03917 }
03918 
03919 /*******************************************************************/
03924 UNIV_INTERN
03925 ulint
03926 btr_push_update_extern_fields(
03927 /*==========================*/
03928   dtuple_t* tuple,  
03929   const upd_t*  update, 
03930   mem_heap_t* heap) 
03931 {
03932   ulint     n_pushed  = 0;
03933   ulint     n;
03934   const upd_field_t*  uf;
03935 
03936   ut_ad(tuple);
03937   ut_ad(update);
03938 
03939   uf = update->fields;
03940   n = upd_get_n_fields(update);
03941 
03942   for (; n--; uf++) {
03943     if (dfield_is_ext(&uf->new_val)) {
03944       dfield_t* field
03945         = dtuple_get_nth_field(tuple, uf->field_no);
03946 
03947       if (!dfield_is_ext(field)) {
03948         dfield_set_ext(field);
03949         n_pushed++;
03950       }
03951 
03952       switch (uf->orig_len) {
03953         byte* data;
03954         ulint len;
03955         byte* buf;
03956       case 0:
03957         break;
03958       case BTR_EXTERN_FIELD_REF_SIZE:
03959         /* Restore the original locally stored
03960         part of the column.  In the undo log,
03961         InnoDB writes a longer prefix of externally
03962         stored columns, so that column prefixes
03963         in secondary indexes can be reconstructed. */
03964         dfield_set_data(field, (byte*) dfield_get_data(field)
03965             + dfield_get_len(field)
03966             - BTR_EXTERN_FIELD_REF_SIZE,
03967             BTR_EXTERN_FIELD_REF_SIZE);
03968         dfield_set_ext(field);
03969         break;
03970       default:
03971         /* Reconstruct the original locally
03972         stored part of the column.  The data
03973         will have to be copied. */
03974         ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
03975 
03976         data = (unsigned char *)dfield_get_data(field);
03977         len = dfield_get_len(field);
03978 
03979         buf = (unsigned char *)mem_heap_alloc(heap, uf->orig_len);
03980         /* Copy the locally stored prefix. */
03981         memcpy(buf, data,
03982                uf->orig_len
03983                - BTR_EXTERN_FIELD_REF_SIZE);
03984         /* Copy the BLOB pointer. */
03985         memcpy(buf + uf->orig_len
03986                - BTR_EXTERN_FIELD_REF_SIZE,
03987                data + len - BTR_EXTERN_FIELD_REF_SIZE,
03988                BTR_EXTERN_FIELD_REF_SIZE);
03989 
03990         dfield_set_data(field, buf, uf->orig_len);
03991         dfield_set_ext(field);
03992       }
03993     }
03994   }
03995 
03996   return(n_pushed);
03997 }
03998 
03999 /*******************************************************************/
04002 static
04003 ulint
04004 btr_blob_get_part_len(
04005 /*==================*/
04006   const byte* blob_header)  
04007 {
04008   return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
04009 }
04010 
04011 /*******************************************************************/
04014 static
04015 ulint
04016 btr_blob_get_next_page_no(
04017 /*======================*/
04018   const byte* blob_header)  
04019 {
04020   return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
04021 }
04022 
/*******************************************************************/
/** Commits the mini-transaction latching a BLOB page and then tries
to evict that page from the buffer pool. The block is only freed if,
after the commit, it is still allocated to the same file page. */
static
void
btr_blob_free(
/*==========*/
  buf_block_t*  block,  /*!< in: buffer block of a BLOB page;
        must be x-latched by mtr */
  ibool   all,  /*!< in: passed to buf_LRU_free_block;
        presumably TRUE = also free the compressed
        copy — matches the fallback below; confirm
        against buf_LRU_free_block */
  mtr_t*    mtr)  /*!< in: mini-transaction to commit */
{
  buf_pool_t* buf_pool = buf_pool_from_block(block);
  ulint   space = buf_block_get_space(block);
  ulint   page_no = buf_block_get_page_no(block);

  ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));

  /* Release the page latch before touching the LRU lists. */
  mtr_commit(mtr);

  buf_pool_mutex_enter(buf_pool);
  mutex_enter(&block->mutex);

  /* Only free the block if it is still allocated to
  the same file page. */

  if (buf_block_get_state(block)
      == BUF_BLOCK_FILE_PAGE
      && buf_block_get_space(block) == space
      && buf_block_get_page_no(block) == page_no) {

    if (buf_LRU_free_block(&block->page, all, NULL)
        != BUF_LRU_FREED
        && all && block->page.zip.data) {
      /* Attempt to deallocate the uncompressed page
      if the whole block cannot be deallocated. */

      buf_LRU_free_block(&block->page, FALSE, NULL);
    }
  }

  buf_pool_mutex_exit(buf_pool);
  mutex_exit(&block->mutex);
}
04066 
04067 /*******************************************************************/
04073 UNIV_INTERN
04074 ulint
04075 btr_store_big_rec_extern_fields(
04076 /*============================*/
04077   dict_index_t* index,    
04079   buf_block_t*  rec_block,  
04080   rec_t*    rec,    
04081   const ulint*  offsets,  
04085   big_rec_t*  big_rec_vec,  
04087   mtr_t*    /*local_mtr __attribute__((unused))*/) 
04090 {
04091   ulint rec_page_no;
04092   byte* field_ref;
04093   ulint extern_len;
04094   ulint store_len;
04095   ulint page_no;
04096   ulint space_id;
04097   ulint zip_size;
04098   ulint prev_page_no;
04099   ulint hint_page_no;
04100   ulint i;
04101   mtr_t mtr;
04102   mem_heap_t* heap = NULL;
04103   page_zip_des_t* page_zip;
04104   z_stream c_stream;
04105 
04106   ut_ad(rec_offs_validate(rec, index, offsets));
04107   ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
04108         MTR_MEMO_X_LOCK));
04109   ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
04110   ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
04111   ut_a(dict_index_is_clust(index));
04112 
04113   page_zip = buf_block_get_page_zip(rec_block);
04114   ut_a(dict_table_zip_size(index->table)
04115        == buf_block_get_zip_size(rec_block));
04116 
04117   space_id = buf_block_get_space(rec_block);
04118   zip_size = buf_block_get_zip_size(rec_block);
04119   rec_page_no = buf_block_get_page_no(rec_block);
04120   ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX);
04121 
04122   if (UNIV_LIKELY_NULL(page_zip)) {
04123     int err;
04124 
04125     /* Zlib deflate needs 128 kilobytes for the default
04126     window size, plus 512 << memLevel, plus a few
04127     kilobytes for small objects.  We use reduced memLevel
04128     to limit the memory consumption, and preallocate the
04129     heap, hoping to avoid memory fragmentation. */
04130     heap = mem_heap_create(250000);
04131     page_zip_set_alloc(&c_stream, heap);
04132 
04133     err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION,
04134            Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
04135     ut_a(err == Z_OK);
04136   }
04137 
04138   /* We have to create a file segment to the tablespace
04139   for each field and put the pointer to the field in rec */
04140 
04141   for (i = 0; i < big_rec_vec->n_fields; i++) {
04142     ut_ad(rec_offs_nth_extern(offsets,
04143             big_rec_vec->fields[i].field_no));
04144     {
04145       ulint local_len;
04146       field_ref = rec_get_nth_field(
04147         rec, offsets, big_rec_vec->fields[i].field_no,
04148         &local_len);
04149       ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
04150       local_len -= BTR_EXTERN_FIELD_REF_SIZE;
04151       field_ref += local_len;
04152     }
04153     extern_len = big_rec_vec->fields[i].len;
04154     UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
04155            extern_len);
04156 
04157     ut_a(extern_len > 0);
04158 
04159     prev_page_no = FIL_NULL;
04160 
04161     if (UNIV_LIKELY_NULL(page_zip)) {
04162       int err = deflateReset(&c_stream);
04163       ut_a(err == Z_OK);
04164 
04165       c_stream.next_in = (Bytef *) big_rec_vec->fields[i].data;
04166       c_stream.avail_in = extern_len;
04167     }
04168 
04169     for (;;) {
04170       buf_block_t*  block;
04171       page_t*   page;
04172 
04173       mtr_start(&mtr);
04174 
04175       if (prev_page_no == FIL_NULL) {
04176         hint_page_no = 1 + rec_page_no;
04177       } else {
04178         hint_page_no = prev_page_no + 1;
04179       }
04180 
04181       block = btr_page_alloc(index, hint_page_no,
04182                  FSP_NO_DIR, 0, &mtr);
04183       if (UNIV_UNLIKELY(block == NULL)) {
04184 
04185         mtr_commit(&mtr);
04186 
04187         if (UNIV_LIKELY_NULL(page_zip)) {
04188           deflateEnd(&c_stream);
04189           mem_heap_free(heap);
04190         }
04191 
04192         return(DB_OUT_OF_FILE_SPACE);
04193       }
04194 
04195       page_no = buf_block_get_page_no(block);
04196       page = buf_block_get_frame(block);
04197 
04198       if (prev_page_no != FIL_NULL) {
04199         buf_block_t*  prev_block;
04200         page_t*   prev_page;
04201 
04202         prev_block = buf_page_get(space_id, zip_size,
04203                 prev_page_no,
04204                 RW_X_LATCH, &mtr);
04205         buf_block_dbg_add_level(prev_block,
04206               SYNC_EXTERN_STORAGE);
04207         prev_page = buf_block_get_frame(prev_block);
04208 
04209         if (UNIV_LIKELY_NULL(page_zip)) {
04210           mlog_write_ulint(
04211             prev_page + FIL_PAGE_NEXT,
04212             page_no, MLOG_4BYTES, &mtr);
04213           memcpy(buf_block_get_page_zip(
04214                    prev_block)
04215                  ->data + FIL_PAGE_NEXT,
04216                  prev_page + FIL_PAGE_NEXT, 4);
04217         } else {
04218           mlog_write_ulint(
04219             prev_page + FIL_PAGE_DATA
04220             + BTR_BLOB_HDR_NEXT_PAGE_NO,
04221             page_no, MLOG_4BYTES, &mtr);
04222         }
04223 
04224       }
04225 
04226       if (UNIV_LIKELY_NULL(page_zip)) {
04227         int   err;
04228         page_zip_des_t* blob_page_zip;
04229 
04230         /* Write FIL_PAGE_TYPE to the redo log
04231         separately, before logging any other
04232         changes to the page, so that the debug
04233         assertions in
04234         recv_parse_or_apply_log_rec_body() can
04235         be made simpler.  Before InnoDB Plugin
04236         1.0.4, the initialization of
04237         FIL_PAGE_TYPE was logged as part of
04238         the mlog_log_string() below. */
04239 
04240         mlog_write_ulint(page + FIL_PAGE_TYPE,
04241              prev_page_no == FIL_NULL
04242              ? FIL_PAGE_TYPE_ZBLOB
04243              : FIL_PAGE_TYPE_ZBLOB2,
04244              MLOG_2BYTES, &mtr);
04245 
04246         c_stream.next_out = page
04247           + FIL_PAGE_DATA;
04248         c_stream.avail_out
04249           = page_zip_get_size(page_zip)
04250           - FIL_PAGE_DATA;
04251 
04252         err = deflate(&c_stream, Z_FINISH);
04253         ut_a(err == Z_OK || err == Z_STREAM_END);
04254         ut_a(err == Z_STREAM_END
04255              || c_stream.avail_out == 0);
04256 
04257         /* Write the "next BLOB page" pointer */
04258         mlog_write_ulint(page + FIL_PAGE_NEXT,
04259              FIL_NULL, MLOG_4BYTES, &mtr);
04260         /* Initialize the unused "prev page" pointer */
04261         mlog_write_ulint(page + FIL_PAGE_PREV,
04262              FIL_NULL, MLOG_4BYTES, &mtr);
04263         /* Write a back pointer to the record
04264         into the otherwise unused area.  This
04265         information could be useful in
04266         debugging.  Later, we might want to
04267         implement the possibility to relocate
04268         BLOB pages.  Then, we would need to be
04269         able to adjust the BLOB pointer in the
04270         record.  We do not store the heap
04271         number of the record, because it can
04272         change in page_zip_reorganize() or
04273         btr_page_reorganize().  However, also
04274         the page number of the record may
04275         change when B-tree nodes are split or
04276         merged. */
04277         mlog_write_ulint(page
04278              + FIL_PAGE_FILE_FLUSH_LSN,
04279              space_id,
04280              MLOG_4BYTES, &mtr);
04281         mlog_write_ulint(page
04282              + FIL_PAGE_FILE_FLUSH_LSN + 4,
04283              rec_page_no,
04284              MLOG_4BYTES, &mtr);
04285 
04286         /* Zero out the unused part of the page. */
04287         memset(page + page_zip_get_size(page_zip)
04288                - c_stream.avail_out,
04289                0, c_stream.avail_out);
04290         mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN,
04291             page_zip_get_size(page_zip)
04292             - FIL_PAGE_FILE_FLUSH_LSN,
04293             &mtr);
04294         /* Copy the page to compressed storage,
04295         because it will be flushed to disk
04296         from there. */
04297         blob_page_zip = buf_block_get_page_zip(block);
04298         ut_ad(blob_page_zip);
04299         ut_ad(page_zip_get_size(blob_page_zip)
04300               == page_zip_get_size(page_zip));
04301         memcpy(blob_page_zip->data, page,
04302                page_zip_get_size(page_zip));
04303 
04304         if (err == Z_OK && prev_page_no != FIL_NULL) {
04305 
04306           goto next_zip_page;
04307         }
04308 
04309         rec_block = buf_page_get(space_id, zip_size,
04310                rec_page_no,
04311                RW_X_LATCH, &mtr);
04312         buf_block_dbg_add_level(rec_block,
04313               SYNC_NO_ORDER_CHECK);
04314 
04315         if (err == Z_STREAM_END) {
04316           mach_write_to_4(field_ref
04317               + BTR_EXTERN_LEN, 0);
04318           mach_write_to_4(field_ref
04319               + BTR_EXTERN_LEN + 4,
04320               c_stream.total_in);
04321         } else {
04322           memset(field_ref + BTR_EXTERN_LEN,
04323                  0, 8);
04324         }
04325 
04326         if (prev_page_no == FIL_NULL) {
04327           mach_write_to_4(field_ref
04328               + BTR_EXTERN_SPACE_ID,
04329               space_id);
04330 
04331           mach_write_to_4(field_ref
04332               + BTR_EXTERN_PAGE_NO,
04333               page_no);
04334 
04335           mach_write_to_4(field_ref
04336               + BTR_EXTERN_OFFSET,
04337               FIL_PAGE_NEXT);
04338         }
04339 
04340         page_zip_write_blob_ptr(
04341           page_zip, rec, index, offsets,
04342           big_rec_vec->fields[i].field_no, &mtr);
04343 
04344 next_zip_page:
04345         prev_page_no = page_no;
04346 
04347         /* Commit mtr and release the
04348         uncompressed page frame to save memory. */
04349         btr_blob_free(block, FALSE, &mtr);
04350 
04351         if (err == Z_STREAM_END) {
04352           break;
04353         }
04354       } else {
04355         mlog_write_ulint(page + FIL_PAGE_TYPE,
04356              FIL_PAGE_TYPE_BLOB,
04357              MLOG_2BYTES, &mtr);
04358 
04359         if (extern_len > (UNIV_PAGE_SIZE
04360               - FIL_PAGE_DATA
04361               - BTR_BLOB_HDR_SIZE
04362               - FIL_PAGE_DATA_END)) {
04363           store_len = UNIV_PAGE_SIZE
04364             - FIL_PAGE_DATA
04365             - BTR_BLOB_HDR_SIZE
04366             - FIL_PAGE_DATA_END;
04367         } else {
04368           store_len = extern_len;
04369         }
04370 
04371         mlog_write_string(page + FIL_PAGE_DATA
04372               + BTR_BLOB_HDR_SIZE,
04373               (const byte*)
04374               big_rec_vec->fields[i].data
04375               + big_rec_vec->fields[i].len
04376               - extern_len,
04377               store_len, &mtr);
04378         mlog_write_ulint(page + FIL_PAGE_DATA
04379              + BTR_BLOB_HDR_PART_LEN,
04380              store_len, MLOG_4BYTES, &mtr);
04381         mlog_write_ulint(page + FIL_PAGE_DATA
04382              + BTR_BLOB_HDR_NEXT_PAGE_NO,
04383              FIL_NULL, MLOG_4BYTES, &mtr);
04384 
04385         extern_len -= store_len;
04386 
04387         rec_block = buf_page_get(space_id, zip_size,
04388                rec_page_no,
04389                RW_X_LATCH, &mtr);
04390         buf_block_dbg_add_level(rec_block,
04391               SYNC_NO_ORDER_CHECK);
04392 
04393         mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
04394              MLOG_4BYTES, &mtr);
04395         mlog_write_ulint(field_ref
04396              + BTR_EXTERN_LEN + 4,
04397              big_rec_vec->fields[i].len
04398              - extern_len,
04399              MLOG_4BYTES, &mtr);
04400 
04401         if (prev_page_no == FIL_NULL) {
04402           mlog_write_ulint(field_ref
04403                + BTR_EXTERN_SPACE_ID,
04404                space_id,
04405                MLOG_4BYTES, &mtr);
04406 
04407           mlog_write_ulint(field_ref
04408                + BTR_EXTERN_PAGE_NO,
04409                page_no,
04410                MLOG_4BYTES, &mtr);
04411 
04412           mlog_write_ulint(field_ref
04413                + BTR_EXTERN_OFFSET,
04414                FIL_PAGE_DATA,
04415                MLOG_4BYTES, &mtr);
04416         }
04417 
04418         prev_page_no = page_no;
04419 
04420         mtr_commit(&mtr);
04421 
04422         if (extern_len == 0) {
04423           break;
04424         }
04425       }
04426     }
04427   }
04428 
04429   if (UNIV_LIKELY_NULL(page_zip)) {
04430     deflateEnd(&c_stream);
04431     mem_heap_free(heap);
04432   }
04433 
04434   return(DB_SUCCESS);
04435 }
04436 
04437 /*******************************************************************/
04439 static
04440 void
04441 btr_check_blob_fil_page_type(
04442 /*=========================*/
04443   ulint   space_id, 
04444   ulint   page_no,  
04445   const page_t* page,   
04446   ibool   read)   
04447 {
04448   ulint type = fil_page_get_type(page);
04449 
04450   ut_a(space_id == page_get_space_id(page));
04451   ut_a(page_no == page_get_page_no(page));
04452 
04453   if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
04454     ulint flags = fil_space_get_flags(space_id);
04455 
04456     if (UNIV_LIKELY
04457         ((flags & DICT_TF_FORMAT_MASK) == DICT_TF_FORMAT_51)) {
04458       /* Old versions of InnoDB did not initialize
04459       FIL_PAGE_TYPE on BLOB pages.  Do not print
04460       anything about the type mismatch when reading
04461       a BLOB page that is in Antelope format.*/
04462       return;
04463     }
04464 
04465     ut_print_timestamp(stderr);
04466     fprintf(stderr,
04467       "  InnoDB: FIL_PAGE_TYPE=%lu"
04468       " on BLOB %s space %lu page %lu flags %lx\n",
04469       (ulong) type, read ? "read" : "purge",
04470       (ulong) space_id, (ulong) page_no, (ulong) flags);
04471     ut_error;
04472   }
04473 }
04474 
04475 /*******************************************************************/
04480 UNIV_INTERN
04481 void
04482 btr_free_externally_stored_field(
04483 /*=============================*/
04484   dict_index_t* index,    
04492   byte*   field_ref,  
04493   const rec_t*  rec,    
04495   const ulint*  offsets,  
04497   page_zip_des_t* page_zip, 
04499   ulint   i,    
04501   enum trx_rb_ctx rb_ctx,   
04502   mtr_t*    /*local_mtr __attribute__((unused))*/) 
04505 {
04506   page_t*   page;
04507   ulint   space_id;
04508   ulint   rec_zip_size = dict_table_zip_size(index->table);
04509   ulint   ext_zip_size;
04510   ulint   page_no;
04511   ulint   next_page_no;
04512   mtr_t   mtr;
04513 #ifdef UNIV_DEBUG
04514   ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
04515         MTR_MEMO_X_LOCK));
04516   ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
04517              MTR_MEMO_PAGE_X_FIX));
04518   ut_ad(!rec || rec_offs_validate(rec, index, offsets));
04519 
04520   if (rec) {
04521     ulint local_len;
04522     const byte* f = rec_get_nth_field(rec, offsets,
04523                   i, &local_len);
04524     ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
04525     local_len -= BTR_EXTERN_FIELD_REF_SIZE;
04526     f += local_len;
04527     ut_ad(f == field_ref);
04528   }
04529 #endif /* UNIV_DEBUG */
04530 
04531   if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
04532           BTR_EXTERN_FIELD_REF_SIZE))) {
04533     /* In the rollback of uncommitted transactions, we may
04534     encounter a clustered index record whose BLOBs have
04535     not been written.  There is nothing to free then. */
04536     ut_a(rb_ctx == RB_RECOVERY || rb_ctx == RB_RECOVERY_PURGE_REC);
04537     return;
04538   }
04539 
04540   space_id = mach_read_from_4(field_ref + BTR_EXTERN_SPACE_ID);
04541 
04542   if (UNIV_UNLIKELY(space_id != dict_index_get_space(index))) {
04543     ext_zip_size = fil_space_get_zip_size(space_id);
04544     /* This must be an undo log record in the system tablespace,
04545     that is, in row_purge_upd_exist_or_extern().
04546     Currently, externally stored records are stored in the
04547     same tablespace as the referring records. */
04548     ut_ad(!page_get_space_id(page_align(field_ref)));
04549     ut_ad(!rec);
04550     ut_ad(!page_zip);
04551   } else {
04552     ext_zip_size = rec_zip_size;
04553   }
04554 
04555   if (!rec) {
04556     /* This is a call from row_purge_upd_exist_or_extern(). */
04557     ut_ad(!page_zip);
04558     rec_zip_size = 0;
04559   }
04560 
04561   for (;;) {
04562 #ifdef UNIV_SYNC_DEBUG
04563     buf_block_t*  rec_block;
04564 #endif /* UNIV_SYNC_DEBUG */
04565     buf_block_t*  ext_block;
04566 
04567     mtr_start(&mtr);
04568 
04569 #ifdef UNIV_SYNC_DEBUG
04570     rec_block =
04571 #endif /* UNIV_SYNC_DEBUG */
04572       buf_page_get(page_get_space_id(
04573              page_align(field_ref)),
04574            rec_zip_size,
04575            page_get_page_no(
04576              page_align(field_ref)),
04577            RW_X_LATCH, &mtr);
04578     buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
04579     page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
04580 
04581     if (/* There is no external storage data */
04582         page_no == FIL_NULL
04583         /* This field does not own the externally stored field */
04584         || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
04585       & BTR_EXTERN_OWNER_FLAG)
04586         /* Rollback and inherited field */
04587         || ((rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY)
04588       && (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
04589           & BTR_EXTERN_INHERITED_FLAG))) {
04590 
04591       /* Do not free */
04592       mtr_commit(&mtr);
04593 
04594       return;
04595     }
04596 
04597     ext_block = buf_page_get(space_id, ext_zip_size, page_no,
04598            RW_X_LATCH, &mtr);
04599     buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
04600     page = buf_block_get_frame(ext_block);
04601 
04602     if (ext_zip_size) {
04603       /* Note that page_zip will be NULL
04604       in row_purge_upd_exist_or_extern(). */
04605       switch (fil_page_get_type(page)) {
04606       case FIL_PAGE_TYPE_ZBLOB:
04607       case FIL_PAGE_TYPE_ZBLOB2:
04608         break;
04609       default:
04610         ut_error;
04611       }
04612       next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
04613 
04614       btr_page_free_low(index, ext_block, 0, &mtr);
04615 
04616       if (UNIV_LIKELY(page_zip != NULL)) {
04617         mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
04618             next_page_no);
04619         mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
04620             0);
04621         page_zip_write_blob_ptr(page_zip, rec, index,
04622               offsets, i, &mtr);
04623       } else {
04624         mlog_write_ulint(field_ref
04625              + BTR_EXTERN_PAGE_NO,
04626              next_page_no,
04627              MLOG_4BYTES, &mtr);
04628         mlog_write_ulint(field_ref
04629              + BTR_EXTERN_LEN + 4, 0,
04630              MLOG_4BYTES, &mtr);
04631       }
04632     } else {
04633       ut_a(!page_zip);
04634       btr_check_blob_fil_page_type(space_id, page_no, page,
04635                  FALSE);
04636 
04637       next_page_no = mach_read_from_4(
04638         page + FIL_PAGE_DATA
04639         + BTR_BLOB_HDR_NEXT_PAGE_NO);
04640 
04641       /* We must supply the page level (= 0) as an argument
04642       because we did not store it on the page (we save the
04643       space overhead from an index page header. */
04644 
04645       btr_page_free_low(index, ext_block, 0, &mtr);
04646 
04647       mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
04648            next_page_no,
04649            MLOG_4BYTES, &mtr);
04650       /* Zero out the BLOB length.  If the server
04651       crashes during the execution of this function,
04652       trx_rollback_or_clean_all_recovered() could
04653       dereference the half-deleted BLOB, fetching a
04654       wrong prefix for the BLOB. */
04655       mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
04656            0,
04657            MLOG_4BYTES, &mtr);
04658     }
04659 
04660     /* Commit mtr and release the BLOB block to save memory. */
04661     btr_blob_free(ext_block, TRUE, &mtr);
04662   }
04663 }
04664 
04665 /***********************************************************/
04667 static
04668 void
04669 btr_rec_free_externally_stored_fields(
04670 /*==================================*/
04671   dict_index_t* index,  
04673   rec_t*    rec,  
04674   const ulint*  offsets,
04675   page_zip_des_t* page_zip,
04677   enum trx_rb_ctx rb_ctx, 
04678   mtr_t*    mtr)  
04681 {
04682   ulint n_fields;
04683   ulint i;
04684 
04685   ut_ad(rec_offs_validate(rec, index, offsets));
04686   ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
04687   /* Free possible externally stored fields in the record */
04688 
04689   ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
04690   n_fields = rec_offs_n_fields(offsets);
04691 
04692   for (i = 0; i < n_fields; i++) {
04693     if (rec_offs_nth_extern(offsets, i)) {
04694       ulint len;
04695       byte* data
04696         = rec_get_nth_field(rec, offsets, i, &len);
04697       ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
04698 
04699       btr_free_externally_stored_field(
04700         index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
04701         rec, offsets, page_zip, i, rb_ctx, mtr);
04702     }
04703   }
04704 }
04705 
04706 /***********************************************************/
04709 static
04710 void
04711 btr_rec_free_updated_extern_fields(
04712 /*===============================*/
04713   dict_index_t* index,  
04715   rec_t*    rec,  
04716   page_zip_des_t* page_zip,
04718   const ulint*  offsets,
04719   const upd_t*  update, 
04720   enum trx_rb_ctx rb_ctx, 
04721   mtr_t*    mtr)  
04723 {
04724   ulint n_fields;
04725   ulint i;
04726 
04727   ut_ad(rec_offs_validate(rec, index, offsets));
04728   ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
04729 
04730   /* Free possible externally stored fields in the record */
04731 
04732   n_fields = upd_get_n_fields(update);
04733 
04734   for (i = 0; i < n_fields; i++) {
04735     const upd_field_t* ufield = upd_get_nth_field(update, i);
04736 
04737     if (rec_offs_nth_extern(offsets, ufield->field_no)) {
04738       ulint len;
04739       byte* data = rec_get_nth_field(
04740         rec, offsets, ufield->field_no, &len);
04741       ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
04742 
04743       btr_free_externally_stored_field(
04744         index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
04745         rec, offsets, page_zip,
04746         ufield->field_no, rb_ctx, mtr);
04747     }
04748   }
04749 }
04750 
04751 /*******************************************************************/
04755 static
04756 ulint
04757 btr_copy_blob_prefix(
04758 /*=================*/
04759   byte*   buf,  
04761   ulint   len,  
04762   ulint   space_id,
04763   ulint   page_no,
04764   ulint   offset) 
04765 {
04766   ulint copied_len  = 0;
04767 
04768   for (;;) {
04769     mtr_t   mtr;
04770     buf_block_t*  block;
04771     const page_t* page;
04772     const byte* blob_header;
04773     ulint   part_len;
04774     ulint   copy_len;
04775 
04776     mtr_start(&mtr);
04777 
04778     block = buf_page_get(space_id, 0, page_no, RW_S_LATCH, &mtr);
04779     buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
04780     page = buf_block_get_frame(block);
04781 
04782     btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);
04783 
04784     blob_header = page + offset;
04785     part_len = btr_blob_get_part_len(blob_header);
04786     copy_len = ut_min(part_len, len - copied_len);
04787 
04788     memcpy(buf + copied_len,
04789            blob_header + BTR_BLOB_HDR_SIZE, copy_len);
04790     copied_len += copy_len;
04791 
04792     page_no = btr_blob_get_next_page_no(blob_header);
04793 
04794     mtr_commit(&mtr);
04795 
04796     if (page_no == FIL_NULL || copy_len != part_len) {
04797       UNIV_MEM_ASSERT_RW(buf, copied_len);
04798       return(copied_len);
04799     }
04800 
04801     /* On other BLOB pages except the first the BLOB header
04802     always is at the page data start: */
04803 
04804     offset = FIL_PAGE_DATA;
04805 
04806     ut_ad(copied_len <= len);
04807   }
04808 }
04809 
04810 /*******************************************************************/
/** Fetches and uncompresses the prefix of a compressed BLOB into
d_stream->next_out, following the chain of compressed BLOB pages that
starts at (space_id, page_no, offset).  Stops at the end of the chain,
when the output buffer is full, or on a page/inflate error; errors are
reported to stderr and result in a silently truncated copy. */
static
void
btr_copy_zblob_prefix(
/*==================*/
	z_stream*	d_stream,/*!< in/out: the decompressing stream;
				next_out and avail_out must have been
				initialized by the caller */
	ulint		zip_size,/*!< in: compressed BLOB page size */
	ulint		space_id,/*!< in: space id of the BLOB pages */
	ulint		page_no,/*!< in: page number of the first BLOB page */
	ulint		offset)	/*!< in: offset on the first BLOB page */
{
	/* The first page of a chain must be FIL_PAGE_TYPE_ZBLOB;
	all subsequent pages must be FIL_PAGE_TYPE_ZBLOB2. */
	ulint	page_type = FIL_PAGE_TYPE_ZBLOB;

	ut_ad(ut_is_2pow(zip_size));
	ut_ad(zip_size >= PAGE_ZIP_MIN_SIZE);
	ut_ad(zip_size <= UNIV_PAGE_SIZE);
	ut_ad(space_id);

	for (;;) {
		buf_page_t*	bpage;
		int		err;
		ulint		next_page_no;

		/* There is no latch on bpage directly.  Instead,
		bpage is protected by the B-tree page latch that
		is being held on the clustered index record, or,
		in row_merge_copy_blobs(), by an exclusive table lock. */
		bpage = buf_page_get_zip(space_id, zip_size, page_no);

		if (UNIV_UNLIKELY(!bpage)) {
			/* The page could not be read; give up and
			leave the output truncated. */
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: Cannot load"
				" compressed BLOB"
				" page %lu space %lu\n",
				(ulong) page_no, (ulong) space_id);
			return;
		}

		if (UNIV_UNLIKELY
		    (fil_page_get_type(bpage->zip.data) != page_type)) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: Unexpected type %lu of"
				" compressed BLOB"
				" page %lu space %lu\n",
				(ulong) fil_page_get_type(bpage->zip.data),
				(ulong) page_no, (ulong) space_id);
			goto end_of_blob;
		}

		next_page_no = mach_read_from_4(bpage->zip.data + offset);

		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
			/* When the BLOB begins at page header,
			the compressed data payload does not
			immediately follow the next page pointer. */
			offset = FIL_PAGE_DATA;
		} else {
			/* Skip the 4-byte next-page pointer that was
			just read; the payload follows it. */
			offset += 4;
		}

		d_stream->next_in = bpage->zip.data + offset;
		d_stream->avail_in = zip_size - offset;

		err = inflate(d_stream, Z_NO_FLUSH);
		switch (err) {
		case Z_OK:
			if (!d_stream->avail_out) {
				/* The output buffer is full: done. */
				goto end_of_blob;
			}
			break;
		case Z_STREAM_END:
			if (next_page_no == FIL_NULL) {
				goto end_of_blob;
			}
			/* Z_STREAM_END before the last page of the
			chain: treat as corruption. */
			/* fall through */
		default:
inflate_error:
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: inflate() of"
				" compressed BLOB"
				" page %lu space %lu returned %d (%s)\n",
				(ulong) page_no, (ulong) space_id,
				err, d_stream->msg);
			/* fall through: after reporting, give up on
			the rest of the BLOB, same as Z_BUF_ERROR */
		case Z_BUF_ERROR:
			goto end_of_blob;
		}

		if (next_page_no == FIL_NULL) {
			/* Last page of the chain: flush any output
			zlib may still be holding back. */
			if (!d_stream->avail_in) {
				ut_print_timestamp(stderr);
				fprintf(stderr,
					"  InnoDB: unexpected end of"
					" compressed BLOB"
					" page %lu space %lu\n",
					(ulong) page_no,
					(ulong) space_id);
			} else {
				err = inflate(d_stream, Z_FINISH);
				switch (err) {
				case Z_STREAM_END:
				case Z_BUF_ERROR:
					break;
				default:
					goto inflate_error;
				}
			}

end_of_blob:
			buf_page_release_zip(bpage);
			return;
		}

		buf_page_release_zip(bpage);

		/* On other BLOB pages except the first
		the BLOB header always is at the page header: */

		page_no = next_page_no;
		offset = FIL_PAGE_NEXT;
		page_type = FIL_PAGE_TYPE_ZBLOB2;
	}
}
04937 
04938 /*******************************************************************/
04943 static
04944 ulint
04945 btr_copy_externally_stored_field_prefix_low(
04946 /*========================================*/
04947   byte*   buf,  
04949   ulint   len,  
04950   ulint   zip_size,
04952   ulint   space_id,
04953   ulint   page_no,
04954   ulint   offset) 
04955 {
04956   if (UNIV_UNLIKELY(len == 0)) {
04957     return(0);
04958   }
04959 
04960   if (UNIV_UNLIKELY(zip_size)) {
04961     int   err;
04962     z_stream  d_stream;
04963     mem_heap_t* heap;
04964 
04965     /* Zlib inflate needs 32 kilobytes for the default
04966     window size, plus a few kilobytes for small objects. */
04967     heap = mem_heap_create(40000);
04968     page_zip_set_alloc(&d_stream, heap);
04969 
04970     err = inflateInit(&d_stream);
04971     ut_a(err == Z_OK);
04972 
04973     d_stream.next_out = buf;
04974     d_stream.avail_out = len;
04975     d_stream.avail_in = 0;
04976 
04977     btr_copy_zblob_prefix(&d_stream, zip_size,
04978               space_id, page_no, offset);
04979     inflateEnd(&d_stream);
04980     mem_heap_free(heap);
04981     UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
04982     return(d_stream.total_out);
04983   } else {
04984     return(btr_copy_blob_prefix(buf, len, space_id,
04985               page_no, offset));
04986   }
04987 }
04988 
04989 /*******************************************************************/
04994 UNIV_INTERN
04995 ulint
04996 btr_copy_externally_stored_field_prefix(
04997 /*====================================*/
04998   byte*   buf,  
04999   ulint   len,  
05000   ulint   zip_size,
05002   const byte* data, 
05006   ulint   local_len)
05007 {
05008   ulint space_id;
05009   ulint page_no;
05010   ulint offset;
05011 
05012   ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
05013 
05014   local_len -= BTR_EXTERN_FIELD_REF_SIZE;
05015 
05016   if (UNIV_UNLIKELY(local_len >= len)) {
05017     memcpy(buf, data, len);
05018     return(len);
05019   }
05020 
05021   memcpy(buf, data, local_len);
05022   data += local_len;
05023 
05024   ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
05025 
05026   if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
05027     /* The externally stored part of the column has been
05028     (partially) deleted.  Signal the half-deleted BLOB
05029     to the caller. */
05030 
05031     return(0);
05032   }
05033 
05034   space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
05035 
05036   page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
05037 
05038   offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
05039 
05040   return(local_len
05041          + btr_copy_externally_stored_field_prefix_low(buf + local_len,
05042                    len - local_len,
05043                    zip_size,
05044                    space_id, page_no,
05045                    offset));
05046 }
05047 
05048 /*******************************************************************/
05052 static
05053 byte*
05054 btr_copy_externally_stored_field(
05055 /*=============================*/
05056   ulint*    len,  
05057   const byte* data, 
05061   ulint   zip_size,
05063   ulint   local_len,
05064   mem_heap_t* heap) 
05065 {
05066   ulint space_id;
05067   ulint page_no;
05068   ulint offset;
05069   ulint extern_len;
05070   byte* buf;
05071 
05072   ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
05073 
05074   local_len -= BTR_EXTERN_FIELD_REF_SIZE;
05075 
05076   space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);
05077 
05078   page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);
05079 
05080   offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);
05081 
05082   /* Currently a BLOB cannot be bigger than 4 GB; we
05083   leave the 4 upper bytes in the length field unused */
05084 
05085   extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);
05086 
05087   buf = (unsigned char *)mem_heap_alloc(heap, local_len + extern_len);
05088 
05089   memcpy(buf, data, local_len);
05090   *len = local_len
05091     + btr_copy_externally_stored_field_prefix_low(buf + local_len,
05092                     extern_len,
05093                     zip_size,
05094                     space_id,
05095                     page_no, offset);
05096 
05097   return(buf);
05098 }
05099 
05100 /*******************************************************************/
05103 UNIV_INTERN
05104 byte*
05105 btr_rec_copy_externally_stored_field(
05106 /*=================================*/
05107   const rec_t*  rec,  
05109   const ulint*  offsets,
05110   ulint   zip_size,
05112   ulint   no, 
05113   ulint*    len,  
05114   mem_heap_t* heap) 
05115 {
05116   ulint   local_len;
05117   const byte* data;
05118 
05119   ut_a(rec_offs_nth_extern(offsets, no));
05120 
05121   /* An externally stored field can contain some initial
05122   data from the field, and in the last 20 bytes it has the
05123   space id, page number, and offset where the rest of the
05124   field data is stored, and the data length in addition to
05125   the data stored locally. We may need to store some data
05126   locally to get the local record length above the 128 byte
05127   limit so that field offsets are stored in two bytes, and
05128   the extern bit is available in those two bytes. */
05129 
05130   data = rec_get_nth_field(rec, offsets, no, &local_len);
05131 
05132   ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
05133 
05134   if (UNIV_UNLIKELY
05135       (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
05136          field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
05137     /* The externally stored field was not written yet.
05138     This record should only be seen by
05139     recv_recovery_rollback_active() or any
05140     TRX_ISO_READ_UNCOMMITTED transactions. */
05141     return(NULL);
05142   }
05143 
05144   return(btr_copy_externally_stored_field(len, data,
05145             zip_size, local_len, heap));
05146 }
05147 #endif /* !UNIV_HOTBACKUP */