From cd4c6002c69aaff59053699b0e1a9fadd33a2c90 Mon Sep 17 00:00:00 2001 From: Timothy Warren Date: Tue, 30 Apr 2019 16:07:37 -0400 Subject: [PATCH] Step 10 --- db.c | 211 +++++++++++++++++++++++++++++++++--- spec/main_spec.rb | 264 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 403 insertions(+), 72 deletions(-) diff --git a/db.c b/db.c index a62dd1a..bbdff91 100644 --- a/db.c +++ b/db.c @@ -109,6 +109,22 @@ const uint32_t PARENT_POINTER_SIZE = sizeof(uint32_t); const uint32_t PARENT_POINTER_OFFSET = IS_ROOT_OFFSET + IS_ROOT_SIZE; const uint32_t COMMON_NODE_HEADER_SIZE = NODE_TYPE_SIZE + IS_ROOT_SIZE + PARENT_POINTER_SIZE; +/* + * Internal Node Header Layout + */ +const uint32_t INTERNAL_NODE_NUM_KEYS_SIZE = sizeof(uint32_t); +const uint32_t INTERNAL_NODE_NUM_KEYS_OFFSET = COMMON_NODE_HEADER_SIZE; +const uint32_t INTERNAL_NODE_RIGHT_CHILD_SIZE = sizeof(uint32_t); +const uint32_t INTERNAL_NODE_RIGHT_CHILD_OFFSET = INTERNAL_NODE_NUM_KEYS_OFFSET + INTERNAL_NODE_NUM_KEYS_SIZE; +const uint32_t INTERNAL_NODE_HEADER_SIZE = COMMON_NODE_HEADER_SIZE + INTERNAL_NODE_NUM_KEYS_SIZE + INTERNAL_NODE_RIGHT_CHILD_SIZE; + +/* + * Internal Node Body Layout + */ +const uint32_t INTERNAL_NODE_KEY_SIZE = sizeof(uint32_t); +const uint32_t INTERNAL_NODE_CHILD_SIZE = sizeof(uint32_t); +const uint32_t INTERNAL_NODE_CELL_SIZE = INTERNAL_NODE_CHILD_SIZE + INTERNAL_NODE_KEY_SIZE; + /* * Leaf Node Header Layout */ @@ -126,6 +142,8 @@ const uint32_t LEAF_NODE_VALUE_OFFSET = LEAF_NODE_KEY_OFFSET + LEAF_NODE_KEY_SIZ const uint32_t LEAF_NODE_CELL_SIZE = LEAF_NODE_KEY_SIZE + LEAF_NODE_VALUE_SIZE; const uint32_t LEAF_NODE_SPACE_FOR_CELLS = PAGE_SIZE - LEAF_NODE_HEADER_SIZE; const uint32_t LEAF_NODE_MAX_CELLS = LEAF_NODE_SPACE_FOR_CELLS / LEAF_NODE_CELL_SIZE; +const uint32_t LEAF_NODE_RIGHT_SPLIT_COUNT = (LEAF_NODE_MAX_CELLS + 1) / 2; +const uint32_t LEAF_NODE_LEFT_SPLIT_COUNT = (LEAF_NODE_MAX_CELLS + 1) - LEAF_NODE_RIGHT_SPLIT_COUNT; NodeType get_node_type(void* node) { uint8_t value 
= *((uint8_t*)(node + NODE_TYPE_OFFSET)); @@ -137,6 +155,44 @@ void set_node_type(void* node, NodeType type) { *((uint8_t*)(node + NODE_TYPE_OFFSET)) = value; } +bool is_node_root(void* node) { + uint8_t value = *((uint8_t*)(node + IS_ROOT_OFFSET)); + return (bool)value; +} + +void set_node_root(void* node, bool is_root) { + uint8_t value = is_root; + *((uint8_t*)(node + IS_ROOT_OFFSET)) = value; +} + +uint32_t* internal_node_num_keys(void* node) { + return node + INTERNAL_NODE_NUM_KEYS_OFFSET; +} + +uint32_t* internal_node_right_child(void* node) { + return node + INTERNAL_NODE_RIGHT_CHILD_OFFSET; +} + +uint32_t* internal_node_cell(void* node, uint32_t cell_num) { + return node + INTERNAL_NODE_HEADER_SIZE + cell_num * INTERNAL_NODE_CELL_SIZE; +} + +uint32_t* internal_node_child(void* node, uint32_t child_num) { + uint32_t num_keys = *internal_node_num_keys(node); + if (child_num > num_keys) { + printf("Tried to access child_num %d > num_keys %d\n", child_num, num_keys); + exit(EXIT_FAILURE); + } else if (child_num == num_keys) { + return internal_node_right_child(node); + } else { + return internal_node_cell(node, child_num); + } +} + +uint32_t* internal_node_key(void* node, uint32_t key_num) { + return (uint32_t*)((char*)internal_node_cell(node, key_num) + INTERNAL_NODE_CHILD_SIZE); /* key follows the child pointer; offset is in bytes, so step through char* rather than uint32_t* */ +} + uint32_t* leaf_node_num_cells(void* node) { return (char *)node + LEAF_NODE_NUM_CELLS_OFFSET; } @@ -153,6 +209,15 @@ void* leaf_node_value(void* node, uint32_t cell_num) { return leaf_node_cell(node, cell_num) + LEAF_NODE_KEY_SIZE; } +uint32_t get_node_max_key(void* node) { + switch (get_node_type(node)) { + case NODE_INTERNAL: + return *internal_node_key(node, *internal_node_num_keys(node) - 1); + case NODE_LEAF: + return *leaf_node_key(node, *leaf_node_num_cells(node) - 1); + } +} + void print_constants() { printf("ROW_SIZE: %d\n", ROW_SIZE); printf("COMMON_NODE_HEADER_SIZE: %d\n", COMMON_NODE_HEADER_SIZE); @@ -197,6 +262,43 @@ void* get_page(Pager* pager, uint32_t page_num) { return
pager->pages[page_num]; } +void indent(uint32_t level) { + for (uint32_t i = 0; i < level; i++) { + printf(" "); + } +} + +void print_tree(Pager* pager, uint32_t page_num, uint32_t indentation_level) { + void* node = get_page(pager, page_num); + uint32_t num_keys, child; + + switch(get_node_type(node)) { + case (NODE_LEAF): + num_keys = *leaf_node_num_cells(node); + indent(indentation_level); + printf("- leaf (size %d)\n", num_keys); /* same "(size N)" shape as the internal-node line below */ + for (uint32_t i = 0; i < num_keys; i++) { + indent(indentation_level + 1); + printf("- %d\n", *leaf_node_key(node, i)); + } + break; + case (NODE_INTERNAL): + num_keys = *internal_node_num_keys(node); + indent(indentation_level); + printf("- internal (size %d)\n", num_keys); + for (uint32_t i = 0; i < num_keys; i++) { + child = *internal_node_child(node, i); + print_tree(pager, child, indentation_level + 1); + + indent(indentation_level + 1); + printf("- key %d\n", *internal_node_key(node, i)); + } + child = *internal_node_right_child(node); + print_tree(pager, child, indentation_level + 1); + break; + } +} + void serialize_row(Row* source, void* destination) { memcpy(destination + ID_OFFSET, &(source->id), ID_SIZE); memcpy(destination + USERNAME_OFFSET, &(source->username), USERNAME_SIZE); @@ -211,9 +313,16 @@ void deserialize_row(void* source, Row* destination) { void initialize_leaf_node(void* node) { set_node_type(node, NODE_LEAF); + set_node_root(node, false); *leaf_node_num_cells(node) = 0; } +void initialize_internal_node(void* node) { + set_node_type(node, NODE_INTERNAL); + set_node_root(node, false); + *internal_node_num_keys(node) = 0; +} + Cursor* leaf_node_find(Table* table, uint32_t page_num, uint32_t key) { void* node = get_page(table->pager, page_num); uint32_t num_cells = *leaf_node_num_cells(node); @@ -328,6 +437,7 @@ Table* db_open(const char* filename) { // New database file. Initialize page 0 as leaf node.
void* root_node = get_page(pager, 0); initialize_leaf_node(root_node); + set_node_root(root_node, true); } return table; @@ -412,22 +522,13 @@ void db_close(Table* table) { free(pager); } -void print_leaf_node(void* node) { - uint32_t num_cells = *leaf_node_num_cells(node); - printf("leaf (size %d)\n", num_cells); - for (uint32_t i = 0; i < num_cells; i++) { - uint32_t key = *leaf_node_key(node, i); - printf(" - %d : %d\n", i, key); - } -} - MetaCommandResult do_meta_command(InputBuffer* input_buffer, Table* table) { if (strcmp(input_buffer->buffer, ".exit") == 0) { db_close(table); exit(EXIT_SUCCESS); } else if (strcmp(input_buffer->buffer, ".btree") == 0){ printf("Tree:\n"); - print_leaf_node(get_page(table->pager, 0)); + print_tree(table->pager, 0, 0); return META_COMMAND_SUCCESS; } else if (strcmp(input_buffer->buffer, ".constants") == 0) { printf("Constants:\n"); @@ -480,14 +581,97 @@ PrepareResult prepare_statement(InputBuffer* input_buffer, Statement* statement) return PREPARE_UNRECOGNIZED_STATEMENT; } +/* + * Until we start recycling free pages, new pages will always + * go onto the end of the database file + */ +uint32_t get_unused_page_num(Pager * pager) { + return pager->num_pages; +} + +void create_new_root(Table* table, uint32_t right_child_page_num) { + /* + * Handle splitting the root. + * Old root copied to new page, becomes left child. + * Address of right child passed in. + * Re-initialize root page to contain the new root node. + * New root points to two children. 
+ */ + + void* root = get_page(table->pager, table->root_page_num); + void* right_child = get_page(table->pager, right_child_page_num); + uint32_t left_child_page_num = get_unused_page_num(table->pager); + void* left_child = get_page(table->pager, left_child_page_num); + + /* Left child has data copied from old root */ + memcpy(left_child, root, PAGE_SIZE); + set_node_root(left_child, false); + + /* Root node is a new internal node with one key and two children */ + initialize_internal_node(root); + set_node_root(root, true); + *internal_node_num_keys(root) = 1; + *internal_node_child(root, 0) = left_child_page_num; + uint32_t left_child_max_key = get_node_max_key(left_child); + *internal_node_key(root, 0) = left_child_max_key; + *internal_node_right_child(root) = right_child_page_num; +} + +void leaf_node_split_and_insert(Cursor* cursor, uint32_t key, Row* value) { + /* + * Create a new node and move half the cells over. + * Insert the new value in one of the two nodes. + * Update parent or create a new parent. + */ + void* old_node = get_page(cursor->table->pager, cursor->page_num); + uint32_t new_page_num = get_unused_page_num(cursor->table->pager); + void* new_node = get_page(cursor->table->pager, new_page_num); + initialize_leaf_node(new_node); + + /* + * All existing keys plus new key should be divided + * evenly between old (left) and new (right) nodes. + * Starting from the right, move each key to correct position. 
+ */ + for (int32_t i = LEAF_NODE_MAX_CELLS; i >= 0; i--) { + void* destination_node; + if (i >= LEAF_NODE_LEFT_SPLIT_COUNT) { /* indexes at or past the left count land in the new (right) node */ + destination_node = new_node; + } else { + destination_node = old_node; + } + uint32_t index_within_node = i % LEAF_NODE_LEFT_SPLIT_COUNT; + void* destination = leaf_node_cell(destination_node, index_within_node); + if (i == cursor->cell_num) { + serialize_row(value, leaf_node_value(destination_node, index_within_node)); /* row bytes go in the value slot, after the key */ + *leaf_node_key(destination_node, index_within_node) = key; + } else if (i > cursor->cell_num) { + memcpy(destination, leaf_node_cell(old_node, i - 1), LEAF_NODE_CELL_SIZE); + } else { + memcpy(destination, leaf_node_cell(old_node, i), LEAF_NODE_CELL_SIZE); + } + } + + /* Update cell count on both leaf nodes */ + *(leaf_node_num_cells(old_node)) = LEAF_NODE_LEFT_SPLIT_COUNT; + *(leaf_node_num_cells(new_node)) = LEAF_NODE_RIGHT_SPLIT_COUNT; + + if (is_node_root(old_node)) { + create_new_root(cursor->table, new_page_num); /* void function: call it, do not return its (non-existent) value */ + } else { + printf("Need to implement updating parent after split\n"); + exit(EXIT_FAILURE); + } +} + void leaf_node_insert(Cursor* cursor, uint32_t key, Row* value) { void* node = get_page(cursor->table->pager, cursor->page_num); uint32_t num_cells = *leaf_node_num_cells(node); if (num_cells >= LEAF_NODE_MAX_CELLS) { // Node full - printf("Need to implement splitting a leaf node.\n"); - exit(EXIT_FAILURE); + leaf_node_split_and_insert(cursor, key, value); + return; } if (cursor->cell_num < num_cells) { @@ -506,9 +690,6 @@ void leaf_node_insert(Cursor* cursor, uint32_t key, Row* value) { ExecuteResult execute_insert(Statement* statement, Table* table) { void* node = get_page(table->pager, table->root_page_num); uint32_t num_cells = (*leaf_node_num_cells(node)); - if (num_cells >= LEAF_NODE_MAX_CELLS) { - return EXECUTE_TABLE_FULL; - } Row* row_to_insert = &(statement->row_to_insert); uint32_t key_to_insert = row_to_insert->id; diff --git a/spec/main_spec.rb b/spec/main_spec.rb index 8260b2c..4dbb826 100644 --- a/spec/main_spec.rb +++ b/spec/main_spec.rb @@ -7,7 +7,11 @@ describe 'database' do
raw_output = nil IO.popen("./db test.db", "r+") do |pipe| commands.each do |command| - pipe.puts command + begin + pipe.puts command + rescue Errno::EPIPE + break + end end pipe.close_write @@ -24,7 +28,7 @@ describe 'database' do "select", ".exit", ]) - expect(result).to eq([ + expect(result).to match_array([ "db > Executed.", "db > (1, user1, person1@example.com)", "Executed.", @@ -32,13 +36,37 @@ describe 'database' do ]) end + it 'keeps data after closing connection' do + result1 = run_script([ + "insert 1 user1 person1@example.com", + ".exit", + ]) + expect(result1).to match_array([ + "db > Executed.", + "db > ", + ]) + + result2 = run_script([ + "select", + ".exit", + ]) + expect(result2).to match_array([ + "db > (1, user1, person1@example.com)", + "Executed.", + "db > ", + ]) + end + it 'prints error message when table is full' do script = (1..1401).map do |i| "insert #{i} user#{i} person#{i}@example.com" end script << ".exit" result = run_script(script) - expect(result[-2]).to eq('db > Error: Table full.') + expect(result.last(2)).to match_array([ + "db > Executed.", + "db > Need to implement splitting internal node", + ]) end it 'allows inserting strings that are the maximum length' do @@ -50,7 +78,7 @@ describe 'database' do ".exit", ] result = run_script(script) - expect(result).to eq([ + expect(result).to match_array([ "db > Executed.", "db > (1, #{long_username}, #{long_email})", "Executed.", @@ -67,7 +95,7 @@ describe 'database' do ".exit", ] result = run_script(script) - expect(result).to eq([ + expect(result).to match_array([ "db > String is too long.", "db > Executed.", "db > ", @@ -81,88 +109,210 @@ describe 'database' do ".exit", ] result = run_script(script) - expect(result).to eq([ + expect(result).to match_array([ "db > ID must be positive.", "db > Executed.", "db > ", ]) end - it 'keeps data after closing connection' do - result1 = run_script([ + it 'prints an error message if there is a duplicate id' do + script = [ + "insert 1 user1 
person1@example.com", "insert 1 user1 person1@example.com", - ".exit", - ]) - expect(result1).to eq([ - "db > Executed.", - "db > ", - ]) - result2 = run_script([ "select", ".exit", - ]) - expect(result2).to eq([ + ] + result = run_script(script) + expect(result).to match_array([ + "db > Executed.", + "db > Error: Duplicate key.", "db > (1, user1, person1@example.com)", "Executed.", "db > ", ]) end - it 'prints constants' do - script = [ - ".constants", - ".exit", - ] - result = run_script(script) - - expect(result).to match_array([ - "db > Constants:", - "ROW_SIZE: 293", - "COMMON_NODE_HEADER_SIZE: 6", - "LEAF_NODE_HEADER_SIZE: 10", - "LEAF_NODE_CELL_SIZE: 297", - "LEAF_NODE_SPACE_FOR_CELLS: 4086", - "LEAF_NODE_MAX_CELLS: 13", - "db > ", - ]) - end - it 'allows printing out the structure of a one-node btree' do script = [3, 1, 2].map do |i| - "insert #{i} user#{i} person#{i}@example.com" + "insert #{i} user#{i} person#{i}@example.com" end - script << ".btree" script << ".exit" result = run_script(script) expect(result).to match_array([ - "db > Executed.", - "db > Executed.", - "db > Executed.", - "db > Tree:", - "leaf (size 3)", - " - 0 : 1", - " - 1 : 2", - " - 2 : 3", - "db > " + "db > Executed.", + "db > Executed.", + "db > Executed.", + "db > Tree:", + "- leaf (size 3)", + " - 1", + " - 2", + " - 3", + "db > " ]) end - it 'prints an error message if there is a duplicate id' do + it 'allows printing out the structure of a 3-leaf-node btree' do + script = (1..14).map do |i| + "insert #{i} user#{i} person#{i}@example.com" + end + script << ".btree" + script << "insert 15 user15 person15@example.com" + script << ".exit" + result = run_script(script) + + expect(result[14...(result.length)]).to match_array([ + "db > Tree:", + "- internal (size 1)", + " - leaf (size 7)", + " - 1", + " - 2", + " - 3", + " - 4", + " - 5", + " - 6", + " - 7", + " - key 7", + " - leaf (size 7)", + " - 8", + " - 9", + " - 10", + " - 11", + " - 12", + " - 13", + " - 14", + "db > 
Executed.", + "db > ", + ]) + end + + it 'allows printing out the structure of a 4-leaf-node btree' do script = [ - "insert 1 user1 person1@example.com", - "insert 1 user1 person1@example.com", - "select", - ".exit", + "insert 18 user18 person18@example.com", + "insert 7 user7 person7@example.com", + "insert 10 user10 person10@example.com", + "insert 29 user29 person29@example.com", + "insert 23 user23 person23@example.com", + "insert 4 user4 person4@example.com", + "insert 14 user14 person14@example.com", + "insert 30 user30 person30@example.com", + "insert 15 user15 person15@example.com", + "insert 26 user26 person26@example.com", + "insert 22 user22 person22@example.com", + "insert 19 user19 person19@example.com", + "insert 2 user2 person2@example.com", + "insert 1 user1 person1@example.com", + "insert 21 user21 person21@example.com", + "insert 11 user11 person11@example.com", + "insert 6 user6 person6@example.com", + "insert 20 user20 person20@example.com", + "insert 5 user5 person5@example.com", + "insert 8 user8 person8@example.com", + "insert 9 user9 person9@example.com", + "insert 3 user3 person3@example.com", + "insert 12 user12 person12@example.com", + "insert 27 user27 person27@example.com", + "insert 17 user17 person17@example.com", + "insert 16 user16 person16@example.com", + "insert 13 user13 person13@example.com", + "insert 24 user24 person24@example.com", + "insert 25 user25 person25@example.com", + "insert 28 user28 person28@example.com", + ".btree", + ".exit", ] result = run_script(script) + + expect(result[30...(result.length)]).to match_array([ + "db > Tree:", + "- internal (size 3)", + " - leaf (size 7)", + " - 1", + " - 2", + " - 3", + " - 4", + " - 5", + " - 6", + " - 7", + " - key 7", + " - leaf (size 8)", + " - 8", + " - 9", + " - 10", + " - 11", + " - 12", + " - 13", + " - 14", + " - 15", + " - key 15", + " - leaf (size 7)", + " - 16", + " - 17", + " - 18", + " - 19", + " - 20", + " - 21", + " - 22", + " - key 22", + " - leaf (size 8)", + 
" - 23", + " - 24", + " - 25", + " - 26", + " - 27", + " - 28", + " - 29", + " - 30", + "db > ", + ]) + end + + it 'prints constants' do + script = [ + ".constants", + ".exit", + ] + result = run_script(script) + expect(result).to match_array([ - "db > Executed.", - "db > Error: Duplicate key.", - "db > (1, user1, person1@example.com)", - "Executed.", - "db > ", + "db > Constants:", + "ROW_SIZE: 293", + "COMMON_NODE_HEADER_SIZE: 6", + "LEAF_NODE_HEADER_SIZE: 14", + "LEAF_NODE_CELL_SIZE: 297", + "LEAF_NODE_SPACE_FOR_CELLS: 4082", + "LEAF_NODE_MAX_CELLS: 13", + "db > ", + ]) + end + + it 'prints all rows in a multi-level tree' do + script = [] + (1..15).each do |i| + script << "insert #{i} user#{i} person#{i}@example.com" + end + script << "select" + script << ".exit" + result = run_script(script) + expect(result[15...result.length]).to match_array([ + "db > (1, user1, person1@example.com)", + "(2, user2, person2@example.com)", + "(3, user3, person3@example.com)", + "(4, user4, person4@example.com)", + "(5, user5, person5@example.com)", + "(6, user6, person6@example.com)", + "(7, user7, person7@example.com)", + "(8, user8, person8@example.com)", + "(9, user9, person9@example.com)", + "(10, user10, person10@example.com)", + "(11, user11, person11@example.com)", + "(12, user12, person12@example.com)", + "(13, user13, person13@example.com)", + "(14, user14, person14@example.com)", + "(15, user15, person15@example.com)", + "Executed.", "db > ", ]) end end