.. _importdata-py: ############# importdata.py ############# *************************** class DataImporter(object): *************************** Importing data from 'inlined' format; i.e. the data for each level in one file:: >>> from simo.input.importdata import DataImporter >>> execfile('input/test/mock4importdata.py') >>> #from simo.input.test.mock4importdata import * >>> imp = DataImporter(inputdb, mapping, importdate, ... logger, logname, lexicon, 100) >>> imp.import_data('inlined', [inline], 'simulation') ... # doctest: +ELLIPSIS ... # doctest: +NORMALIZE_WHITESPACE Called DataDB.get_main_level() Called Lexicon.get_level_name(None) Called Logger.log_message('testlog', 'info', 'Importing data...') Called DataDB.drop_id(u'stand1') Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'simulation': [(datetime.date(2009, 1, 6), {'oid': 'simulation', 'values': [], 'id': 'simulation', 'parent id': None})]}, 0, 0) Called DataDB.add_data_from_dictionary( {'comp_unit': [(datetime.date(2009, 1, 6), {'oid': u'stand1', 'values': [('DEV_CLASS', 1), ('ORIG_DC', 1.0), ('MAIN_GROUP', 1), ('SOMETHING_ELSE', 99), ('Inventory_date', 733413), ('USE_RESTRICTION_SILVIC', '0'), ('USE_RESTRICTION_HARVEST', '0')], 'id': u'stand1', 'parent id': 'simulation'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 1, 6), {'parent level': 1, 'oid': u'stratum1_1', 'values': [('BA', 200.0), ('BT', u'test')], 'id': u'stand1-stratum1_1', 'parent id': u'stand1'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 1, 6), {'parent level': 1, 'oid': u'stratum1_2', 'values': [('BA', 22.0), ('BT', u'piece')], 'id': u'stand1-stratum1_2', 'parent id': u'stand1'})]}, 0, 0) Called DataDB.drop_id(u'stand2') Called Logger.log_message( 'testlog', 'error', u'REJECTING: comp_unit stand2; MAIN_GROUP (4) in [4, 5, 6, 7, 8]') Called 
DataDB.row_count('simulation') Called Logger.log_message( 'testlog', 'info', 'In total 2 simulation units processed') Called Logger.log_message( 'testlog', 'info', 'In total 1 simulation units imported') Called DataDB.db.commit() Called DataDB.db.vacuum_analyze() False Importing data in 'by_level' format; i.e., each data level has its own file:: >>> imp.import_data('by_level', by_level, 'simulation') ... # doctest: +NORMALIZE_WHITESPACE ... # doctest: +ELLIPSIS Called DataDB.get_main_level() Called Lexicon.get_level_name(None) Called Logger.log_message('testlog', 'info', 'Importing data...') Called DataDB.drop_id(u'stand1') Called DataDB.drop_id(u'stand2') Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'comp_unit': [(datetime.date(2009, 1, 1), {'oid': u'stand1', 'values': [('MAIN_GROUP', 1), ('SOMETHING_ELSE', 99), ('Inventory_date', 733408), ('USE_RESTRICTION_SILVIC', '0'), ('USE_RESTRICTION_HARVEST', '0')], 'id': u'stand1', 'parent id': 'simulation'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'comp_unit': [(datetime.date(2009, 12, 31), {'oid': u'stand2', 'values': [('MAIN_GROUP', 1), ('SOMETHING_ELSE', 99), ('Inventory_date', 733772), ('USE_RESTRICTION_SILVIC', '0'), ('USE_RESTRICTION_HARVEST', '0')], 'id': u'stand2', 'parent id': 'simulation'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 1, 1), {'parent level': 1, 'oid': u'stratum1_1', 'values': [('BA', 200.0), ('BT', u'oh')], 'id': u'stand1-stratum1_1', 'parent id': u'stand1'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 1, 1), {'parent level': 1, 'oid': u'stratum1_2', 'values': [('BA', 22.0), ('BT', u'which')], 'id': u'stand1-stratum1_2', 'parent id': u'stand1'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 
12, 31), {'parent level': 1, 'oid': u'stratum2_1', 'values': [('BA', 31.0), ('BT', u'is')], 'id': u'stand2-stratum2_1', 'parent id': u'stand2'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 12, 31), {'parent level': 1, 'oid': u'stratum2_2', 'values': [('BA', 1.0), ('BT', u'infact')], 'id': u'stand2-stratum2_2', 'parent id': u'stand2'})]}, 0, 0) Called Logger.log_message( 'testlog', 'info', 'In total 2 simulation units processed') Called Logger.log_message( 'testlog', 'info', 'In total 2 simulation units imported') Called DataDB.db.commit() Called DataDB.db.vacuum_analyze() False >>> imp.errors set([]) With skipfirst. If used like here when the first row shouldn't really be skipped, results in orphan lower data level objects in the database. Also tests id generation; the strata for stand2 have missing ids, so they'll get ids 1 and 2:: >>> imp.import_data('inlined', [inline2], 'simulation', skip_first=True) ... # doctest: +ELLIPSIS ... 
# doctest: +NORMALIZE_WHITESPACE Called DataDB.get_main_level() Called Lexicon.get_level_name(None) Called Logger.log_message('testlog', 'info', 'Importing data...') Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(None, {'parent level': 1, 'oid': u'stratum1_1', 'values': [('BA', 200.0), ('BT', u'pretty')], 'id': u'stratum1_1', 'parent id': None})]}, 0, 0) Called DataDB.drop_id(u'stand2') Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(None, {'parent level': 1, 'oid': u'stratum1_2', 'values': [('BA', 22.0), ('BT', u'frekin')], 'id': u'stratum1_2', 'parent id': None})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'comp_unit': [(datetime.date(2009, 12, 31), {'oid': u'stand2', 'values': [('MAIN_GROUP', 1), ('SOMETHING_ELSE', 99), ('Inventory_date', 733772), ('USE_RESTRICTION_SILVIC', '0'), ('USE_RESTRICTION_HARVEST', '0')], 'id': u'stand2', 'parent id': 'simulation'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 12, 31), {'parent level': 1, 'oid': '1', 'values': [('BA', 31.0), ('BT', u'hmm')], 'id': u'stand2-1', 'parent id': u'stand2'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 12, 31), {'parent level': 1, 'oid': '2', 'values': [('BA', 1.0), ('BT', u'wait')], 'id': u'stand2-2', 'parent id': u'stand2'})]}, 0, 0) Called Logger.log_message( 'testlog', 'info', 'In total 1 simulation units processed') Called Logger.log_message( 'testlog', 'info', 'In total 1 simulation units imported') Called DataDB.db.commit() Called DataDB.db.vacuum_analyze() False Specifying a separator to be used instead of the default whitespace:: >>> imp.import_data('inlined', [inline3], 'simulation', separator=';') ... # doctest: +ELLIPSIS ... # doctest: +NORMALIZE_WHITESPACE Called... 
'In total 2 simulation units imported') Called DataDB.db.commit() Called DataDB.db.vacuum_analyze() False By level import for only one, not top level, level with the given data date:: >>> from datetime import date >>> data_date = date(2009, 5, 6) >>> imp.import_data('by_level', by_level2, 'simulation', level_ind=[1], ... data_date=data_date, clear_old=False) ... # doctest: +ELLIPSIS ... # doctest: +NORMALIZE_WHITESPACE Called DataDB.get_main_level() Called Lexicon.get_level_name(None) Called Logger.log_message('testlog', 'info', 'Importing data...') Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 5, 6), {'parent level': 1, 'oid': u'stratum1_1', 'values': [('BA', 200.0), ('BT', u'oh')], 'id': u'stand1-stratum1_1', 'parent id': u'stand1'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 5, 6), {'parent level': 1, 'oid': u'stratum1_2', 'values': [('BA', 22.0), ('BT', u'which')], 'id': u'stand1-stratum1_2', 'parent id': u'stand1'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 5, 6), {'parent level': 1, 'oid': u'stratum2_1', 'values': [('BA', 31.0), ('BT', u'is')], 'id': u'stand2-stratum2_1', 'parent id': u'stand2'})]}, 0, 0) Called DataDB.row_count('simulation') Called DataDB.add_data_from_dictionary( {'stratum': [(datetime.date(2009, 5, 6), {'parent level': 1, 'oid': u'stratum2_2', 'values': [('BA', 1.0), ('BT', u'infact')], 'id': u'stand2-stratum2_2', 'parent id': u'stand2'})]}, 0, 0) Called Logger.log_message( 'testlog', 'info', 'In total 0 simulation units processed') Called Logger.log_message( 'testlog', 'info', 'In total 0 simulation units imported') Called DataDB.db.commit() Called DataDB.db.vacuum_analyze() False def _construct_unique_id(self, lind, oid, pid, bottom_level): ============================================================ Construct a unique 
id for a top-level stand :: >>> imp._construct_unique_id(1, '1', 'simulation', False) '1' Try to construct unique id with an invalid call :: >>> imp._construct_unique_id(2, '1', 'stratum1_2', True) Called Logger.log_message( 'testlog', 'error', "no parent path available from 'stratum' to 'stratum'!") 'stratum1_2-1' Reset oids, which would happen when calling import_data, as otherwise the following call generates an error, which it should not do :: >>> imp.oids = {} Construct unique id for bottom level stratum :: >>> imp._construct_unique_id(2, '1', 'stand1', True) 'stand1-1' def _parse_date(self, datestr): =============================== Parse a date string into a datetime object. :: >>> dates = ['230209', '23.07.09', '23-07-09', '23/07/09', ... '23072009', '23.07.2009', '23-07-2009', '23/07/2009', ... '2009-07-23', 'fail'] >>> [imp._parse_date(date) for date in dates] #doctest: +NORMALIZE_WHITESPACE Called Logger.log_message('testlog', 'error', "invalid date format 'fail'") [datetime.date(2009, 2, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), None] Parse dates with month-first order :: >>> imp.month_first = True >>> dates = ['022309', '07.23.09', '07-23-09', '07/23/09', ... '07232009', '07.23.2009', '07-23-2009', '07/23/2009', ... 
'2009-07-23', 'fail'] >>> [imp._parse_date(date) for date in dates] #doctest: +NORMALIZE_WHITESPACE [datetime.date(2009, 2, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), datetime.date(2009, 7, 23), None] def _split_row(self, line, sep): ================================ Checks that the current line is valid and then splits the line with the given separator Parameters :: line -- input data line, string sep -- column separator, string or None Split some valid rows :: >>> imp._split_row('1;2;3;4;5', ';') [u'1', u'2', u'3', u'4', u'5'] >>> imp._split_row('1 2 3 4 5', ' ') [u'1', u'2', u'3', u'4', u'5'] >>> imp._split_row('1 2 3 4 5', ' ') [u'1', u'', u'2', u'', u'3', u'', u'4', u'', u'5'] >>> imp._split_row('1 2 3 4 5', None) [u'1', u'2', u'3', u'4', u'5'] >>> imp._split_row('1\t2\t3\t4\t5', '\t') [u'1', u'2', u'3', u'4', u'5'] Try to split some rows with mismatching line content and separator :: >>> imp._split_row('1\t2\t3\t4\t5', ' ') >>> imp._split_row('1 2 3 4 5', '\t') >>> imp._split_row('1;2;3;4;5', ' ') >>> imp._split_row('1;2;3;4;5', ',') Split some invalid rows :: >>> imp._split_row(' THIS IS AN ERRONEUS ROW ', '\t') >>> imp._split_row(' THIS IS AN ERRONEUS ROW ', ';') >>> imp._split_row(' ', ' ') Still, some rows might be invalid, but impossible to block :: >>> imp._split_row(' THIS IS AN ERRONEUS ROW ', ' ') [u'', u'THIS', u'IS', u'AN', u'ERRONEUS', u'ROW', u'', u'', u''] Split a row with some unicode as ascii :: >>> imp._split_row('Asdf;V\xc3\xa4\xc3\xa4n\xc3\xa4nen', ';') [u'Asdf', u'V\xe4\xe4n\xe4nen'] Split a row with some iso-8859-1 as ascii WITHOUT the encoding :: >>> imp._split_row('Asdf;V\xe4\xe4n\xe4nen', ';') Called Logger.log_message( 'testlog', 'error', 'Failed to decode import value V??n?nen') [u'Asdf', u'V\ufffd\ufffdn\ufffdnen'] And then after adding that encoding :: >>> imp.encodings = 
['utf8', 'iso-8859-1'] >>> imp._split_row('Asdf;V\xe4\xe4n\xe4nen', ';') [u'Asdf', u'V\xe4\xe4n\xe4nen']