.. _importdata-py: ############# importdata.py ############# *************************** class DataImporter(object): *************************** Importing data from 'inlined' format; i.e. the data for each level in one file:: >>> from simo.input.importdata import DataImporter >>> execfile('input/test/mock4importdata.py') >>> #from simo.input.test.mock4importdata import * >>> imp = DataImporter(inputdb, mapping, importdate, ... logger, logname, lexicon, 100) >>> imp.import_data('inlined', [inline], 'simulation') ... # doctest: +ELLIPSIS ... # doctest: +NORMALIZE_WHITESPACE Called DataDB.get_main_level() Called Lexicon.get_level_name(None) Called Logger.log_message('testlog', 'info', 'Importing data...') Called DataDB.drop_id(u'stand1') Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'simulation': [(datetime.date(2009, 1, 6), {'oid': 'simulation', 'values': [], 'id': 'simulation', 'parent id': None})]}, 0, 0) Called DataDB.add_data_from_dictionary( {'comp_unit': [(datetime.date(2009, 1, 6), {'oid': u'stand1', 'values': [('DEV_CLASS', 1), ('ORIG_DC', 1.0), ('MAIN_GROUP', 1), ('SOMETHING_ELSE', 99), ('Inventory_date', 733413), ('USE_RESTRICTION_SILVIC', '0'), ('USE_RESTRICTION_HARVEST', '0')], 'id': u'stand1', 'parent id': 'simulation'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 1, 6), {'parent level': 1, 'oid': u'stratum1_1', 'values': [('BA', 200.0), ('BT', u'test')], 'id': u'stand1-stratum1_1', 'parent id': u'stand1'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 1, 6), {'parent level': 1, 'oid': u'stratum1_2', 'values': [('BA', 22.0), ('BT', u'piece')], 'id': u'stand1-stratum1_2', 'parent id': u'stand1'})]}, 0, 0) Called DataDB.drop_id(u'stand2') Called Logger.log_message( 'testlog', 'error', u'REJECTING: comp_unit stand2; MAIN_GROUP (4) in [4, 5, 6, 7, 8]') Called 
DataDB.row_count('simulation') Called Logger.log_message( 'testlog', 'info', 'In total 2 simulation units processed') Called Logger.log_message( 'testlog', 'info', 'In total 1 simulation units imported') Called DataDB.db.commit() Called DataDB.db.vacuum_analyze() False Importing data in 'by_level' format; i.e., each data level has its own file:: >>> imp.import_data('by_level', by_level, 'simulation') ... # doctest: +NORMALIZE_WHITESPACE ... # doctest: +ELLIPSIS Called DataDB.get_main_level() Called Lexicon.get_level_name(None) Called Logger.log_message('testlog', 'info', 'Importing data...') Called DataDB.drop_id(u'stand1') Called DataDB.drop_id(u'stand2') Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'comp_unit': [(datetime.date(2009, 1, 1), {'oid': u'stand1', 'values': [('MAIN_GROUP', 1), ('SOMETHING_ELSE', 99), ('Inventory_date', 733408), ('USE_RESTRICTION_SILVIC', '0'), ('USE_RESTRICTION_HARVEST', '0')], 'id': u'stand1', 'parent id': 'simulation'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'comp_unit': [(datetime.date(2009, 12, 31), {'oid': u'stand2', 'values': [('MAIN_GROUP', 1), ('SOMETHING_ELSE', 99), ('Inventory_date', 733772), ('USE_RESTRICTION_SILVIC', '0'), ('USE_RESTRICTION_HARVEST', '0')], 'id': u'stand2', 'parent id': 'simulation'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 1, 1), {'parent level': 1, 'oid': u'stratum1_1', 'values': [('BA', 200.0), ('BT', u'oh')], 'id': u'stand1-stratum1_1', 'parent id': u'stand1'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 1, 1), {'parent level': 1, 'oid': u'stratum1_2', 'values': [('BA', 22.0), ('BT', u'which')], 'id': u'stand1-stratum1_2', 'parent id': u'stand1'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 
12, 31), {'parent level': 1, 'oid': u'stratum2_1', 'values': [('BA', 31.0), ('BT', u'is')], 'id': u'stand2-stratum2_1', 'parent id': u'stand2'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 12, 31), {'parent level': 1, 'oid': u'stratum2_2', 'values': [('BA', 1.0), ('BT', u'infact')], 'id': u'stand2-stratum2_2', 'parent id': u'stand2'})]}, 0, 0) Called Logger.log_message( 'testlog', 'info', 'In total 2 simulation units processed') Called Logger.log_message( 'testlog', 'info', 'In total 2 simulation units imported') Called DataDB.db.commit() Called DataDB.db.vacuum_analyze() False >>> imp.errors set([]) With skipfirst. If used like here when the first row shouldn't really be skipped, results in orphan lower data level objects in the database. Also tests id generation; the strata for stand2 have missing ids, so they'll get ids 1 and 2:: >>> imp.import_data('inlined', [inline2], 'simulation', skip_first=True) ... # doctest: +ELLIPSIS ... 
# doctest: +NORMALIZE_WHITESPACE Called DataDB.get_main_level() Called Lexicon.get_level_name(None) Called Logger.log_message('testlog', 'info', 'Importing data...') Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(None, {'parent level': 1, 'oid': u'stratum1_1', 'values': [('BA', 200.0), ('BT', u'pretty')], 'id': u'stratum1_1', 'parent id': None})]}, 0, 0) Called DataDB.drop_id(u'stand2') Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(None, {'parent level': 1, 'oid': u'stratum1_2', 'values': [('BA', 22.0), ('BT', u'frekin')], 'id': u'stratum1_2', 'parent id': None})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'comp_unit': [(datetime.date(2009, 12, 31), {'oid': u'stand2', 'values': [('MAIN_GROUP', 1), ('SOMETHING_ELSE', 99), ('Inventory_date', 733772), ('USE_RESTRICTION_SILVIC', '0'), ('USE_RESTRICTION_HARVEST', '0')], 'id': u'stand2', 'parent id': 'simulation'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 12, 31), {'parent level': 1, 'oid': '1', 'values': [('BA', 31.0), ('BT', u'hmm')], 'id': u'stand2-1', 'parent id': u'stand2'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 12, 31), {'parent level': 1, 'oid': '2', 'values': [('BA', 1.0), ('BT', u'wait')], 'id': u'stand2-2', 'parent id': u'stand2'})]}, 0, 0) Called Logger.log_message( 'testlog', 'info', 'In total 1 simulation units processed') Called Logger.log_message( 'testlog', 'info', 'In total 1 simulation units imported') Called DataDB.db.commit() Called DataDB.db.vacuum_analyze() False Specifying a separator to be used instead of the default whitespace:: >>> imp.import_data('inlined', [inline3], 'simulation', separator=';') ... # doctest: +ELLIPSIS ... # doctest: +NORMALIZE_WHITESPACE Called... 
'In total 2 simulation units imported') Called DataDB.db.commit() Called DataDB.db.vacuum_analyze() False By level import for only one, not top level, level with the given data date:: >>> from datetime import date >>> data_date = date(2009, 5, 6) >>> imp.import_data('by_level', by_level2, 'simulation', level_ind=[1], ... data_date=data_date, clear_old=False) ... # doctest: +ELLIPSIS ... # doctest: +NORMALIZE_WHITESPACE Called DataDB.get_main_level() Called Lexicon.get_level_name(None) Called Logger.log_message('testlog', 'info', 'Importing data...') Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 5, 6), {'parent level': 1, 'oid': u'stratum1_1', 'values': [('BA', 200.0), ('BT', u'oh')], 'id': u'stand1-stratum1_1', 'parent id': u'stand1'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 5, 6), {'parent level': 1, 'oid': u'stratum1_2', 'values': [('BA', 22.0), ('BT', u'which')], 'id': u'stand1-stratum1_2', 'parent id': u'stand1'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 5, 6), {'parent level': 1, 'oid': u'stratum2_1', 'values': [('BA', 31.0), ('BT', u'is')], 'id': u'stand2-stratum2_1', 'parent id': u'stand2'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 5, 6), {'parent level': 1, 'oid': u'stratum2_2', 'values': [('BA', 1.0), ('BT', u'infact')], 'id': u'stand2-stratum2_2', 'parent id': u'stand2'})]}, 0, 0) Called Logger.log_message( 'testlog', 'info', 'In total 0 simulation units processed') Called Logger.log_message( 'testlog', 'info', 'In total 0 simulation units imported') Called DataDB.db.commit() Called DataDB.db.vacuum_analyze() False def _construct_unique_id(self, lind, oid, pid, bottom_level): ============================================================ Construct a unique 
id for a top-level stand :: >>> imp._construct_unique_id(1, '1', 'simulation', False) '1' Try to construct unique id with an invalid call :: >>> imp._construct_unique_id(2, '1', 'stratum1_2', True) Called Logger.log_message( 'testlog', 'error', "no parent path available from 'stratum' to 'stratum'!") 'stratum1_2-1' Reset oids, which would happen when calling import_data, as otherwise the following call generates an error, which it should not do :: >>> imp.oids = {} Construct unique id for bottom level stratum :: >>> imp._construct_unique_id(2, '1', 'stand1', True) 'stand1-1' def _parse_date(self, datestr): =============================== Parse a date string into a datetime object. :: >>> dates = ['230209', '23.07.09', '23-07-09', '23/07/09', ... '23072009', '23.07.2009', '23-07-2009', '23/07/2009', ... '2009-07-23', 'fail'] >>> [imp._parse_date(date) for date in dates] #doctest: +NORMALIZE_WHITESPACE Called Logger.log_message('testlog', 'error', "invalid date format 'fail'") [datetime.date(2009, 2, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), None] Parse dates with month-first order :: >>> imp.month_first = True >>> dates = ['022309', '07.23.09', '07-23-09', '07/23/09', ... '07232009', '07.23.2009', '07-23-2009', '07/23/2009', ... 
'2009-07-23', 'fail'] >>> [imp._parse_date(date) for date in dates] #doctest: +NORMALIZE_WHITESPACE [datetime.date(2009, 2, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), None] def _split_row(self, line, sep): ================================ Checks that the current line is valid and then splits the line with the given separator Parameters :: line -- input data line, string sep -- column separator, string or None Split some valid rows :: >>> imp._split_row('1;2;3;4;5', ';') [u'1', u'2', u'3', u'4', u'5'] >>> imp._split_row('1 2 3 4 5', ' ') [u'1', u'2', u'3', u'4', u'5'] >>> imp._split_row('1 2 3 4 5', ' ') [u'1', u'', u'2', u'', u'3', u'', u'4', u'', u'5'] >>> imp._split_row('1 2 3 4 5', None) [u'1', u'2', u'3', u'4', u'5'] >>> imp._split_row('1\t2\t3\t4\t5', '\t') [u'1', u'2', u'3', u'4', u'5'] Try to split some rows with mismatching line content and separator :: >>> imp._split_row('1\t2\t3\t4\t5', ' ') >>> imp._split_row('1 2 3 4 5', '\t') >>> imp._split_row('1;2;3;4;5', ' ') >>> imp._split_row('1;2;3;4;5', ',') Split some invalid rows :: >>> imp._split_row(' THIS IS AN ERRONEUS ROW ', '\t') >>> imp._split_row(' THIS IS AN ERRONEUS ROW ', ';') >>> imp._split_row(' ', ' ') Still, some rows might be invalid, but impossible to block :: >>> imp._split_row(' THIS IS AN ERRONEUS ROW ', ' ') [u'', u'THIS', u'IS', u'AN', u'ERRONEUS', u'ROW', u'', u'', u''] Split a row with some unicode as ascii :: >>> imp._split_row('Asdf;V\xc3\xa4\xc3\xa4n\xc3\xa4nen', ';') [u'Asdf', u'V\xe4\xe4n\xe4nen'] Split a row with some iso-8859-1 as ascii WITHOUT the encoding :: >>> imp._split_row('Asdf;V\xe4\xe4n\xe4nen', ';') Called Logger.log_message( 'testlog', 'error', 'Failed to decode import value V??n?nen') [u'Asdf', u'V\ufffd\ufffdn\ufffdnen'] And then after adding that encoding :: >>> imp.encodings = 
['utf8', 'iso-8859-1'] >>> imp._split_row('Asdf;V\xe4\xe4n\xe4nen', ';') [u'Asdf', u'V\xe4\xe4n\xe4nen']