''' Testbench and timing routines to figure-out the fastest schema for shelves.



Todo:

Delayed commits:
    How much insertion speed is gained by n-period commits
    How much does that slow-down searches
    Should iterators force a commit before iterating

Do 'select key', 'select *', and 'select value' all produce the same order?

With unique index or primary key or no index.
    Insertion speed (do we pay a high price for reindexing on every commit)
    Individual Lookup speed (regular value, masked key, uncommitted key)

'''

from random import *
from string import ascii_letters
import sqlite3
import os
from time import clock
from itertools import chain

MAKE_SHELF = ('''
    DROP INDEX IF EXISTS keyndx; DROP TABLE IF EXISTS shelf;
    CREATE TABLE IF NOT EXISTS shelf (key TEXT NOT NULL, value TEXT NOT NULL);
''', 'UNINDEXED')

MAKE_SHELF_PRIMARY = ('''
    DROP INDEX IF EXISTS keyndx; DROP TABLE IF EXISTS shelf;
    CREATE TABLE IF NOT EXISTS shelf (key PRIMARY KEY, value TEXT NOT NULL);
''', 'PRIMARY')

MAKE_SHELF_UNIQUE = ('''
    DROP INDEX IF EXISTS keyndx; DROP TABLE IF EXISTS shelf;
    CREATE TABLE IF NOT EXISTS shelf (key TEXT NOT NULL, value TEXT NOT NULL);
    CREATE UNIQUE INDEX IF NOT EXISTS keyndx ON shelf (key);
''', 'UNIQ')


MAKE_SHELF = ('''
    DROP INDEX IF EXISTS keyndx; DROP TABLE IF EXISTS shelf;
    CREATE TABLE IF NOT EXISTS shelf (key BLOB NOT NULL, value BLOB NOT NULL);
''', 'UNINDEXED')
MAKE_SHELF_PRIMARY = ('''
    DROP INDEX IF EXISTS keyndx; DROP TABLE IF EXISTS shelf;
    CREATE TABLE IF NOT EXISTS shelf (key BLOB PRIMARY KEY, value BLOB NOT NULL);
''', 'PRIMARY')
MAKE_SHELF_UNIQUE = ('''
    DROP INDEX IF EXISTS keyndx; DROP TABLE IF EXISTS shelf;
    CREATE TABLE IF NOT EXISTS shelf (key BLOB NOT NULL, value BLOB NOT NULL);
    CREATE UNIQUE INDEX IF NOT EXISTS keyndx ON shelf (key);
''', 'UNIQ')


SELECTORS = [
    'SELECT * FROM shelf',
    'SELECT * FROM shelf ORDER BY key',
    'SELECT * FROM shelf ORDER BY rowid',
    'SELECT key FROM shelf',
    'SELECT value FROM shelf',
    'SELECT key, value FROM shelf',
    'SELECT COUNT(*) FROM shelf',
    'SELECT MAX(ROWID) FROM shelf',
]

FINDKEY = [
    'SELECT key FROM shelf WHERE key = ?',
    'SELECT value FROM shelf WHERE key = ?',
    'SELECT 1 FROM shelf WHERE key = ?',
]

def populate(n=100, m=5):
    items = []
    for i in range(n):
        key = bytes(''.join(choice(ascii_letters) for j in range(m)), 'utf-8')
        val = bytes(''.join(choice(ascii_letters) for j in range(m)), 'utf-8')
        items += [(key, val)]
    if len(set(k for k,v in items)) != n:
        return populate(n, m)
    return items

def setup(BUILDER, items=None):
    filename = 'tmpshl'
    try:
        os.remove(filename)
    except OSError:
        pass
    conn = sqlite3.connect(filename)
    conn.text_factory = bytes
    conn.executescript(BUILDER)
    if items:
        UPDATE_ITEMS = 'REPLACE INTO shelf (key, value) VALUES (?, ?)'
        conn.executemany(UPDATE_ITEMS, items)
    conn.commit()
    return conn

def fragmentit(conn, addlist, dellist, n=20):
    # Make random adds and deletes, n at time
    addlist = addlist[:]
    dellist = dellist[:]
    while len(addlist) >= n and len(dellist) >= n:
        for i in range(n):
            conn.execute('DELETE FROM shelf WHERE ROWID = ?', dellist.pop())
            conn.commit()
        for i in range(n):
            conn.execute('REPLACE INTO shelf (key, value) VALUES (?, ?)', addlist.pop())
            conn.commit()

def timeit(conn, stmt, args=(), n=20):
    if args:
        conn.execute(stmt, args)    # precompile
    else:
        conn.execute(stmt)          # precompile
    start = clock()
    if args:
        for i in range(n):
            conn.execute(stmt, args).fetchone()
    else:
        for i in range(n):
            conn.execute(stmt).fetchall()
    return '%.2f' % (clock() - start)

n, m = 2000, 6000

seed('xyzpdqbingo')
items = populate(n)
if 1:
    fragment, vacuum = False, False
##  dellist = [(randrange(n),) for i in range(m)]
##  addlist = populate(m)
    for stmt in SELECTORS:
        print(stmt)
        for builder, name in [MAKE_SHELF, MAKE_SHELF_PRIMARY, MAKE_SHELF_UNIQUE]:
            conn = setup(builder, items)
            if fragment:
                fragmentit(conn, addlist, dellist)
            if vacuum:
                conn.execute('VACUUM')
            print(sorted(timeit(conn, stmt, (), n=10000) for i in range(6)), name)
        print()
else:
    pairs = sample(items, 3)
    conn = setup(MAKE_SHELF_PRIMARY[0], items)
    for stmt in FINDKEY:
        print(stmt)
        for key in chain.from_iterable(pairs):
            print(sorted(timeit(conn, stmt, (key,), n=10000) for i in range(6)), key)
        print()




''' Results:

No difference between 'select *', 'select k,v', and 'select * by rowid'
    Doesn't matter if the tables are unindexed, have a primary key, or a unique index.
Fragmented leaves timing ratios the same but takes 3.34 times longer.
Running a VACUUM after fragmenting doesn't speed things up.
"order by key" is twice as slow as unordered, even with indexing.
The effect of "primary key" and a unique index is the same
BLOB and TEXT types have the same timing

Search for a single key:
    Missing key search 8% faster than a found key
    "SELECT 1" is 3% faster than "SELECT key" which is 3% faster than "SELECT value"

Finding the length of the table:
    "SELECT MAX(ROWID)" beats "SELECT COUNT(*)"
    Disassembly shows the former goes straight to the last record,
    while the latter does a full table scan.

-----  Unfragmented:  n, m = 2000, 6000

SELECT * FROM shelf
['0.38', '0.38', '0.38', '0.39', '0.39', '0.47'] UNINDEXED
['0.38', '0.38', '0.39', '0.39', '0.39', '0.41'] PRIMARY
['0.38', '0.38', '0.39', '0.39', '0.41', '0.42'] UNIQ

SELECT * FROM shelf ORDER BY key
['1.63', '1.63', '1.63', '1.64', '1.64', '1.74'] UNINDEXED
['0.57', '0.57', '0.57', '0.58', '0.58', '0.63'] PRIMARY
['0.57', '0.57', '0.57', '0.58', '0.58', '0.71'] UNIQ

SELECT * FROM shelf ORDER BY rowid
['0.38', '0.39', '0.39', '0.39', '0.39', '0.47'] UNINDEXED
['0.38', '0.38', '0.39', '0.39', '0.40', '0.43'] PRIMARY
['0.38', '0.38', '0.39', '0.39', '0.39', '0.44'] UNIQ

SELECT key FROM shelf
['0.26', '0.26', '0.26', '0.26', '0.26', '0.29'] UNINDEXED
['0.26', '0.26', '0.26', '0.26', '0.26', '0.29'] PRIMARY
['0.26', '0.26', '0.26', '0.26', '0.27', '0.35'] UNIQ

SELECT value FROM shelf
['0.26', '0.26', '0.26', '0.27', '0.27', '0.30'] UNINDEXED
['0.26', '0.26', '0.26', '0.26', '0.27', '0.28'] PRIMARY
['0.26', '0.26', '0.26', '0.26', '0.26', '0.32'] UNIQ

SELECT key, value FROM shelf
['0.38', '0.38', '0.38', '0.38', '0.39', '0.44'] UNINDEXED
['0.38', '0.38', '0.38', '0.39', '0.40', '0.43'] PRIMARY
['0.38', '0.38', '0.38', '0.39', '0.39', '0.47'] UNIQ


----- Fragmented:  n, m = 2000, 6000

SELECT * FROM shelf
['1.27', '1.28', '1.29', '1.30', '1.32', '1.39'] UNINDEXED
['1.29', '1.30', '1.31', '1.31', '1.31', '1.45'] PRIMARY
['1.29', '1.29', '1.29', '1.30', '1.31', '1.45'] UNIQ

SELECT * FROM shelf ORDER BY key
['5.55', '5.58', '5.58', '5.59', '5.70', '5.75'] UNINDEXED
['1.91', '1.92', '1.92', '1.92', '1.93', '2.02'] PRIMARY
['1.91', '1.92', '1.94', '1.94', '2.08', '2.10'] UNIQ

SELECT * FROM shelf ORDER BY rowid
['1.30', '1.30', '1.30', '1.31', '1.42', '1.48'] UNINDEXED
['1.28', '1.29', '1.29', '1.30', '1.30', '1.42'] PRIMARY
['1.29', '1.30', '1.31', '1.31', '1.39', '1.40'] UNIQ

SELECT key FROM shelf
['0.90', '0.91', '0.91', '0.92', '0.92', '1.07'] UNINDEXED
['0.89', '0.91', '0.91', '0.91', '0.91', '1.02'] PRIMARY
['0.90', '0.91', '0.91', '0.92', '0.93', '1.06'] UNIQ

SELECT value FROM shelf
['0.90', '0.91', '0.91', '0.91', '0.93', '1.06'] UNINDEXED
['0.89', '0.91', '0.91', '0.91', '0.92', '1.00'] PRIMARY
['0.90', '0.91', '0.91', '0.91', '0.92', '0.99'] UNIQ

SELECT key, value FROM shelf
['1.29', '1.30', '1.30', '1.30', '1.31', '1.45'] UNINDEXED
['1.29', '1.29', '1.29', '1.29', '1.30', '1.41'] PRIMARY
['1.29', '1.29', '1.30', '1.30', '1.30', '1.45'] UNIQ


----- Fragmented, then vacuumed:  n, m = 2000, 6000

SELECT * FROM shelf
['1.29', '1.29', '1.31', '1.31', '1.40', '1.49'] UNINDEXED
['1.32', '1.32', '1.32', '1.33', '1.33', '1.52'] PRIMARY
['1.28', '1.29', '1.31', '1.31', '1.32', '1.47'] UNIQ

SELECT * FROM shelf ORDER BY key
['5.76', '5.77', '5.77', '5.77', '5.77', '5.81'] UNINDEXED
['1.91', '1.91', '1.91', '1.91', '1.92', '2.02'] PRIMARY
['1.89', '1.90', '1.91', '1.92', '1.92', '2.07'] UNIQ

SELECT * FROM shelf ORDER BY rowid
['1.35', '1.37', '1.38', '1.51', '1.53', '1.56'] UNINDEXED
['1.28', '1.29', '1.30', '1.30', '1.30', '1.42'] PRIMARY
['1.32', '1.33', '1.33', '1.34', '1.34', '1.50'] UNIQ

SELECT key FROM shelf
['0.88', '0.89', '0.90', '0.90', '0.90', '1.09'] UNINDEXED
['0.90', '0.91', '0.91', '0.92', '0.92', '1.02'] PRIMARY
['0.89', '0.89', '0.89', '0.89', '0.90', '1.01'] UNIQ

SELECT value FROM shelf
['0.89', '0.90', '0.91', '0.91', '0.91', '1.10'] UNINDEXED
['0.91', '0.91', '0.91', '0.91', '1.03', '1.08'] PRIMARY
['0.90', '0.91', '0.92', '0.92', '0.92', '1.00'] UNIQ

SELECT key, value FROM shelf
['1.29', '1.30', '1.30', '1.30', '1.31', '1.46'] UNINDEXED
['1.29', '1.29', '1.30', '1.30', '1.31', '1.38'] PRIMARY
['1.28', '1.30', '1.30', '1.30', '1.33', '1.40'] UNIQ


----- Search for a single key (alternating between found and not found) ----

SELECT key FROM shelf WHERE key = ?
['0.59', '0.59', '0.59', '0.59', '0.60', '0.64'] b'YzLgm'
['0.55', '0.56', '0.56', '0.56', '0.56', '0.58'] b'rsgVk'
['0.59', '0.59', '0.59', '0.59', '0.59', '0.59'] b'Echad'
['0.55', '0.56', '0.56', '0.56', '0.56', '0.56'] b'uMwIL'
['0.59', '0.59', '0.59', '0.59', '0.59', '0.60'] b'jgmOI'
['0.55', '0.55', '0.56', '0.56', '0.56', '0.56'] b'oNJuV'

SELECT value FROM shelf WHERE key = ?
['0.61', '0.62', '0.62', '0.62', '0.62', '0.62'] b'YzLgm'
['0.56', '0.57', '0.57', '0.57', '0.57', '0.57'] b'rsgVk'
['0.61', '0.61', '0.61', '0.62', '0.62', '0.62'] b'Echad'
['0.57', '0.57', '0.57', '0.57', '0.57', '0.58'] b'uMwIL'
['0.61', '0.61', '0.61', '0.62', '0.62', '0.62'] b'jgmOI'
['0.56', '0.56', '0.57', '0.57', '0.57', '0.57'] b'oNJuV'

SELECT 1 FROM shelf WHERE key = ?
['0.57', '0.57', '0.58', '0.58', '0.58', '0.58'] b'YzLgm'
['0.55', '0.55', '0.55', '0.55', '0.55', '0.56'] b'rsgVk'
['0.57', '0.57', '0.57', '0.58', '0.58', '0.58'] b'Echad'
['0.55', '0.55', '0.55', '0.56', '0.56', '0.56'] b'uMwIL'
['0.57', '0.57', '0.57', '0.58', '0.58', '0.58'] b'jgmOI'
['0.55', '0.55', '0.55', '0.56', '0.56', '0.57'] b'oNJuV'


----- Find length of the table -----

SELECT COUNT(*) FROM shelf
['2.36', '2.37', '2.37', '2.37', '2.39', '2.47'] PRIMARY

SELECT MAX(ROWID) FROM shelf
['0.50', '0.50', '0.50', '0.50', '0.51', '0.55'] PRIMARY
'''
