From 846cc743459b55f8870ea16e339967f6f96c268d Mon Sep 17 00:00:00 2001 From: Timi Fasubaa Date: Mon, 9 Oct 2017 15:01:21 -0700 Subject: [PATCH 1/8] add upload csv button to sources dropdown --- superset/views/core.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/superset/views/core.py b/superset/views/core.py index bebf17fd237cf..603684ec81026 100755 --- a/superset/views/core.py +++ b/superset/views/core.py @@ -2465,6 +2465,16 @@ class CssTemplateAsyncModelView(CssTemplateModelView): category_label=__('SQL Lab'), ) +appbuilder.add_link( + 'Upload a CSV', + label=__("Upload a CSV"), + href='/csvtodatabaseview/form', + icon="fa-upload", + category='Sources', + category_label=__("Sources"), + category_icon='fa-wrench',) +appbuilder.add_separator("Sources") + @app.after_request def apply_caching(response): From 7e4f1d7b9d102a6846091dbe0ef09dc5c0a7f45b Mon Sep 17 00:00:00 2001 From: Timi Fasubaa Date: Mon, 9 Oct 2017 16:07:56 -0700 Subject: [PATCH 2/8] upload csv to non-hive datasources --- superset/db_engine_specs.py | 83 ++++++++++++++++- superset/forms.py | 180 ++++++++++++++++++++++++++++++++++++ superset/views/core.py | 68 +++++++++++++- 3 files changed, 328 insertions(+), 3 deletions(-) create mode 100644 superset/forms.py diff --git a/superset/db_engine_specs.py b/superset/db_engine_specs.py index 4b66fa1c33ae8..c19867e39772a 100644 --- a/superset/db_engine_specs.py +++ b/superset/db_engine_specs.py @@ -22,14 +22,19 @@ import re import textwrap import time +import os +import pandas from flask_babel import lazy_gettext as _ +from flask import g, flash from sqlalchemy import select +from sqlalchemy.engine import create_engine from sqlalchemy.engine.url import make_url from sqlalchemy.sql import text import sqlparse +from werkzeug.utils import secure_filename -from superset import cache_util, conf, utils +from superset import app, cache_util, conf, db, utils from superset.utils import QueryStatus, SupersetTemplateException tracking_url_trans = conf.get('TRACKING_URL_TRANSFORMER') @@ -73,6 +78,82 @@ def extra_table_metadata(cls, database, table_name, schema_name): """Returns engine-specific table metadata""" return {} + @staticmethod + def csv_to_df(**kwargs): + kwargs['filepath_or_buffer'] = \ + app.config['UPLOAD_FOLDER'] + kwargs['filepath_or_buffer'] + kwargs['encoding'] = 'utf-8' + kwargs['iterator'] = True + chunks = pandas.read_csv(**kwargs) + df = pandas.DataFrame() + df = pandas.concat(chunk for chunk in chunks) + return df + + @staticmethod + def df_to_db(**kwargs): + df = kwargs['df'] + table = kwargs['table'] + del kwargs['df'] + del kwargs['table'] + kwargs['con'] = create_engine(kwargs['con'], echo=False) + df.to_sql(**kwargs) + + table.user_id = g.user.id + table.schema = kwargs['schema'] + db.session.add(table) + db.session.commit() + + @staticmethod + def upload_csv(form, table): + def allowed_file(filename): + # Only allow specific file extensions as specified in the config + extension = os.path.splitext(filename)[1] + return extension and extension[1:] in app.config['ALLOWED_EXTENSIONS'] + + filename = secure_filename(form.csv_file.data.filename) + if not allowed_file(filename): + flash("Invalid file type selected.", 'alert') + return False + kwargs = { + 'names': form.names.data if form.names.data else None, + 'filepath_or_buffer': filename, + 'sep': form.sep.data, + 'header': form.header.data, + 'index_col': form.index_col.data, + 'squeeze': form.squeeze.data, + 'prefix': form.prefix.data, + 'mangle_dupe_cols': form.mangle_dupe_cols.data, + 
'skipinitialspace': form.skipinitialspace.data, + 'skiprows': form.skiprows.data, + 'nrows': form.nrows.data, + 'skip_blank_lines': form.skip_blank_lines.data, + 'parse_dates': form.parse_dates.data, + 'infer_datetime_format': form.infer_datetime_format.data, + 'dayfirst': form.dayfirst.data, + 'thousands': form.thousands.data, + 'decimal': form.decimal.data, + 'quotechar': form.quotechar.data, + 'escapechar': form.escapechar.data, + 'comment': form.comment.data, + 'error_bad_lines': form.error_bad_lines.data, + 'chunksize': 10000, + } + df = BaseEngineSpec.csv_to_df(**kwargs) + + df_to_db_kwargs = { + 'table': table, + 'df': df, + 'name': form.name.data, + 'con': form.con.data, + 'schema': form.schema.data, + 'if_exists': form.if_exists.data, + 'index': form.index.data, + 'index_label': form.index_label.data, + 'chunksize': 10000, + } + BaseEngineSpec.df_to_db(**df_to_db_kwargs) + return True + @classmethod def escape_sql(cls, sql): """Escapes the raw SQL""" diff --git a/superset/forms.py b/superset/forms.py new file mode 100644 index 0000000000000..21f471cfb4bd8 --- /dev/null +++ b/superset/forms.py @@ -0,0 +1,180 @@ +"""Contains the logic to create cohesive forms on the explore view""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from flask_babel import lazy_gettext as _ +from flask_appbuilder.fieldwidgets import BS3TextFieldWidget +from flask_appbuilder.forms import DynamicForm +from flask_wtf.file import FileField, FileAllowed, FileRequired +from wtforms import ( + BooleanField, SelectField, IntegerField, StringField) +from wtforms.validators import DataRequired, Optional, NumberRange + +from superset import app + +config = app.config + + +class CsvToDatabaseForm(DynamicForm): + name = StringField( + _('Table Name'), + description=_('Name of table to be created from csv data.'), + validators=[DataRequired()], + widget=BS3TextFieldWidget()) + csv_file = FileField( + _('CSV File'), + description=_('Select a CSV file to be uploaded to a database.'), + validators=[ + FileRequired(), FileAllowed(['csv'], _('CSV Files Only!'))]) + + con = SelectField( + _('Database URI'), + description=_('URI of database in which to add above table.'), + validators=[DataRequired()], + choices=[]) + if_exists = SelectField( + _('Table Exists'), + description=_( + 'If table exists do one of the following: ' + 'Fail (do nothing), Replace (drop and recreate table) ' + 'or Append (insert data).'), + choices=[ + ('fail', _('Fail')), ('replace', _('Replace')), + ('append', _('Append'))], + validators=[DataRequired()]) + + schema = StringField( + _('Schema'), + description=_('Specify a schema (if database flavour supports this).'), + validators=[Optional()], + widget=BS3TextFieldWidget(), + filters=[lambda x: x or None]) + sep = StringField( + _('Delimiter'), + description=_('Delimiter used by CSV file (for whitespace use \s+).'), + validators=[DataRequired()], + widget=BS3TextFieldWidget()) + header = IntegerField( + _('Header Row'), + description=_( + 'Row containing the headers to use as ' + 'column names (0 is first line of data). ' + 'Leave empty if there is no header row.'), + validators=[Optional()], + widget=BS3TextFieldWidget(), + filters=[lambda x: x or None]) + names = StringField( + _('Column Names'), + description=_( + 'List of comma-separated column names to use if ' + 'header row not specified above. 
Leave empty if header ' + 'field populated.'), + validators=[Optional()], + widget=BS3TextFieldWidget(), + filters=[lambda x: x or None]) + index_col = IntegerField( + _('Index Column'), + description=_( + 'Column to use as the row labels of the ' + 'dataframe. Leave empty if no index column.'), + validators=[Optional(), NumberRange(0, 1E+20)], + widget=BS3TextFieldWidget(), + filters=[lambda x: x or None]) + squeeze = BooleanField( + _('Squeeze'), + description=_( + 'Parse the data as a series (specify ' + 'this option if the data contains only one column.)')) + prefix = StringField( + _('Prefix'), + description=_( + 'Prefix to add to column numbers when no header ' + '(e.g. "X" for "X0, X1").'), + validators=[Optional()], + widget=BS3TextFieldWidget(), + filters=[lambda x: x or None]) + mangle_dupe_cols = BooleanField( + _('Mangle Duplicate Columns'), + description=_('Specify duplicate columns as "X.0, X.1".')) + skipinitialspace = BooleanField( + _('Skip Initial Space'), + description=_('Skip spaces after delimiter.')) + skiprows = IntegerField( + _('Skip Rows'), + description=_('Number of rows to skip at start of file.'), + validators=[Optional(), NumberRange(0, 1E+20)], + widget=BS3TextFieldWidget(), + filters=[lambda x: x or None]) + nrows = IntegerField( + _('Rows to Read'), + description=_('Number of rows of file to read.'), + validators=[Optional(), NumberRange(0, 1E+20)], + widget=BS3TextFieldWidget(), + filters=[lambda x: x or None]) + skip_blank_lines = BooleanField( + _('Skip Blank Lines'), + description=_( + 'Skip blank lines rather than interpreting them ' + 'as NaN values.')) + parse_dates = BooleanField( + _('Parse Dates'), + description=_('Parse date values.')) + infer_datetime_format = BooleanField( + _('Infer Datetime Format'), + description=_( + 'Use Pandas to interpret the datetime format ' + 'automatically.')) + dayfirst = BooleanField( + _('Day First'), + description=_( + 'Use DD/MM (European/International) date format.')) + thousands = StringField( + _('Thousands Separator'), + description=_('Separator for values in thousands.'), + validators=[Optional()], + widget=BS3TextFieldWidget(), + filters=[lambda x: x or None]) + decimal = StringField( + _('Decimal Character'), + description=_('Character to interpret as decimal point.'), + validators=[Optional()], + widget=BS3TextFieldWidget(), + filters=[lambda x: x or '.']) + quotechar = StringField( + _('Quote Character'), + description=_( + 'Character used to denote the start and end of a quoted item.'), + validators=[Optional()], + widget=BS3TextFieldWidget(), + filters=[lambda x: x or "'"]) + escapechar = StringField( + _('Escape Character'), + description=_('Character used to escape a quoted item.'), + validators=[Optional()], + widget=BS3TextFieldWidget(), + filters=[lambda x: x or None]) + comment = StringField( + _('Comment Character'), + description=_('Character used to denote the start of a comment.'), + validators=[Optional()], + widget=BS3TextFieldWidget(), + filters=[lambda x: x or None]) + error_bad_lines = BooleanField( + _('Error On Bad Lines'), + description=_( + 'Error on bad lines (e.g. a line with ' + 'too many commas). If false these bad lines will instead ' + 'be dropped from the resulting dataframe.')) + index = BooleanField( + _('Dataframe Index'), + description=_('Write dataframe index as a column.')) + index_label = StringField( + _('Column Label(s)'), + description=_( + 'Column label for index column(s). 
If None is given ' + 'and Dataframe Index is True, Index Names are used.'), + validators=[Optional()], + widget=BS3TextFieldWidget(), + filters=[lambda x: x or None]) diff --git a/superset/views/core.py b/superset/views/core.py index 603684ec81026..09be9cf687640 100755 --- a/superset/views/core.py +++ b/superset/views/core.py @@ -7,16 +7,17 @@ from datetime import datetime, timedelta import json import logging +import os import pickle import re import time import traceback from urllib import parse - from flask import ( flash, g, Markup, redirect, render_template, request, Response, url_for, ) -from flask_appbuilder import expose +from flask_appbuilder import expose, SimpleFormView + from flask_appbuilder.actions import action from flask_appbuilder.models.sqla.interface import SQLAInterface from flask_appbuilder.security.decorators import has_access_api @@ -28,17 +29,21 @@ from sqlalchemy import create_engine from sqlalchemy.engine.url import make_url from werkzeug.routing import BaseConverter +from werkzeug.utils import secure_filename from superset import ( app, appbuilder, cache, db, results_backend, security, sm, sql_lab, utils, viz, ) from superset.connectors.connector_registry import ConnectorRegistry +from superset.connectors.sqla.models import SqlaTable +from superset.forms import CsvToDatabaseForm from superset.legacy import cast_form_data import superset.models.core as models from superset.models.sql_lab import Query from superset.sql_parse import SupersetQuery from superset.utils import has_access, merge_extra_filters, QueryStatus + from .base import ( api, BaseSupersetView, CsvResponse, DeleteMixin, get_error_msg, get_user_roles, json_error_response, SupersetFilter, SupersetModelView, @@ -314,6 +319,65 @@ class DatabaseAsync(DatabaseView): appbuilder.add_view_no_menu(DatabaseAsync) +class CsvToDatabaseView(SimpleFormView): + form = CsvToDatabaseForm + form_title = _('CSV to Database configuration') + add_columns = ['database', 'schema', 'table_name'] + + def form_get(self, form): + form.sep.data = ',' + form.header.data = 0 + form.squeeze.data = False + form.names.data = None + form.mangle_dupe_cols.data = True + form.skipinitialspace.data = False + form.skip_blank_lines.data = True + form.parse_dates.data = True + form.infer_datetime_format.data = True + form.dayfirst.data = False + form.decimal.data = '.' 
+ form.error_bad_lines.data = False + form.if_exists.data = 'append' + all_datasources = db.session.query( + models.Database.sqlalchemy_uri, + models.Database.database_name)\ + .all() + form.con.choices = all_datasources + + def form_post(self, form): + def upload_file(csv_file): + if csv_file and csv_file.filename: + filename = secure_filename(csv_file.filename) + csv_file.save(os.path.join(config['UPLOAD_FOLDER'], filename)) + return filename + + form.names.data = form.names.data.split(",") if form.names.data else None + database = ( + db.session.query(models.Database) + .filter_by(sqlalchemy_uri=form.data.get('con')) + .one() + ) + upload_file(form.csv_file.data) + table = SqlaTable(table_name=form.name.data) + table.database_id = database.id + table.database = database + successful = database.db_engine_spec.upload_csv(form, table) + if successful: + # Go back to welcome page / splash screen + db_name = db.session.query(models.Database.database_name)\ + .filter_by(sqlalchemy_uri=form.data.get('con')).one() + + message = _('CSV file "{0}" uploaded to table "{1}" in ' + 'database "{2}"'.format(form.csv_file.data.filename, + form.name.data, + db_name[0])) + flash(message, 'info') + return redirect('/tablemodelview/list/') + + +appbuilder.add_view_no_menu(CsvToDatabaseView) + + class DatabaseTablesAsync(DatabaseView): list_columns = ['id', 'all_table_names', 'all_schema_names'] From ac790c059b2f9ca1d9c52ee4e12ac1c45101724d Mon Sep 17 00:00:00 2001 From: Timi Fasubaa Date: Mon, 9 Oct 2017 16:10:51 -0700 Subject: [PATCH 3/8] upload csv to hive datasource --- superset/config.py | 11 ++++++++ superset/db_engine_specs.py | 54 +++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/superset/config.py b/superset/config.py index 2d168a9b8d642..7bc4391840209 100644 --- a/superset/config.py +++ b/superset/config.py @@ -186,6 +186,10 @@ ENABLE_CORS = False CORS_OPTIONS = {} +# Allowed format types for upload on Database view +# TODO: Add processing of other spreadsheet formats (xls, xlsx etc) +ALLOWED_EXTENSIONS = set(['csv']) + # CSV Options: key/value pairs that will be passed as argument to DataFrame.to_csv method # note: index option should not be overridden CSV_EXPORT = { @@ -296,6 +300,13 @@ class CeleryConfig(object): # in SQL Lab by using the "Run Async" button/feature RESULTS_BACKEND = None +#The S3 bucket where you want to store your external hive tables created +#from CSV files. For example, 'companyname-superset' +CSV_TO_HIVE_UPLOAD_S3_BUCKET = None + +#The directory within the bucket specified that will contain all the external tables +CSV_TO_HIVE_UPLOAD_DIRECTORY = "EXTERNAL_HIVE_TABLES/" + # A dictionary of items that gets merged into the Jinja context for # SQL Lab. 
The existing context gets updated with this dictionary,
 # meaning values for existing keys get overwritten by the content of this

diff --git a/superset/db_engine_specs.py b/superset/db_engine_specs.py
index c19867e39772a..91a63787ae267 100644
--- a/superset/db_engine_specs.py
+++ b/superset/db_engine_specs.py
@@ -24,6 +24,8 @@ import time
 import os
+import csv
+import boto3
 import pandas
 from flask_babel import lazy_gettext as _
 from flask import g, flash
@@ -37,6 +39,8 @@ from superset import app, cache_util, conf, db, utils
 from superset.utils import QueryStatus, SupersetTemplateException
 
+config = app.config
+
 tracking_url_trans = conf.get('TRACKING_URL_TRANSFORMER')
 
 Grain = namedtuple('Grain', 'name label function')
@@ -798,6 +802,56 @@ def fetch_result_sets(cls, db, datasource_type, force=False):
         return BaseEngineSpec.fetch_result_sets(
             db, datasource_type, force=force)
 
+    @staticmethod
+    def upload_csv(form, table):
+        """Uploads a csv file and creates a superset datasource in Hive."""
+        def get_column_names(filepath):
+            with open(filepath, "rb") as f:
+                return csv.reader(f).next()
+
+        table_name = form.name.data
+        filename = form.csv_file.data.filename
+
+        bucket_path = config["CSV_TO_HIVE_UPLOAD_BUCKET"]
+
+        if not bucket_path:
+            logging.info("No upload bucket specified")
+            flash(
+                "No upload bucket specified. "
+                "You can specify one in the config file.",
+                "alert")
+            return False
+
+        upload_prefix = config["CSV_TO_HIVE_UPLOAD_DIRECTORY"]
+        dest_path = os.path.join(table_name, filename)
+
+        upload_path = config['UPLOAD_FOLDER'] + secure_filename(
+            form.csv_file.data.filename)
+        column_names = get_column_names(upload_path)
+        schema_definition = ", ".join(
+            [s + " STRING " for s in column_names])
+
+        s3 = boto3.client('s3')
+        location =\
+            os.path.join("s3a://", bucket_path, upload_prefix, table_name)
+        s3.upload_file(
+            upload_path,
+            'airbnb-superset',
+            os.path.join(upload_prefix, table_name, filename))
+        sql = """CREATE EXTERNAL TABLE {table_name} ( {schema_definition} )
+            ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS
+            TEXTFILE LOCATION '{location}'""".format(**locals())
+        try:
+            logging.info(form.con.data)
+            engine = create_engine(form.con.data)
+            engine.execute(sql)
+            return True
+        except Exception as e:
+            logging.exception(e)
+            logging.info(sql)
+            flash(BaseEngineSpec.extract_error_message(e), "alert")
+            return False
+
     @classmethod
     def convert_dttm(cls, target_type, dttm):
         tt = target_type.upper()

From f352e9dc8f2b4688b0987f4e323cf3e572a5ae92 Mon Sep 17 00:00:00 2001
From: Timi Fasubaa
Date: Mon, 9 Oct 2017 16:11:38 -0700
Subject: [PATCH 4/8] update FAQ page

---
 docs/faq.rst                |  7 +++++++
 superset/config.py          |  9 +++++----
 superset/db_engine_specs.py | 13 ++++---------
 superset/forms.py           |  6 +++---
 superset/views/core.py      |  5 +++--
 5 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/docs/faq.rst b/docs/faq.rst
index d825ef5ba0b28..b1f43877a639a 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -45,6 +45,13 @@ visualizations.
 https://github.com/airbnb/superset/issues?q=label%3Aexample+is%3Aclosed
 
+Can I upload and visualize csv data?
+-------------------------------------
+
+Yes, using the ``Upload a CSV`` button under the ``Sources``
+menu item. This brings up a form that allows you to specify the required information. After creating the table from CSV, it can then be loaded like any other on the ``Sources -> Tables`` page.
+
+
 Why are my queries timing out?
------------------------------ diff --git a/superset/config.py b/superset/config.py index 7bc4391840209..fb9bbf4a2f02a 100644 --- a/superset/config.py +++ b/superset/config.py @@ -300,11 +300,12 @@ class CeleryConfig(object): # in SQL Lab by using the "Run Async" button/feature RESULTS_BACKEND = None -#The S3 bucket where you want to store your external hive tables created -#from CSV files. For example, 'companyname-superset' -CSV_TO_HIVE_UPLOAD_S3_BUCKET = None +# The S3 bucket where you want to store your external hive tables created +# from CSV files. For example, 'companyname-superset' +CSV_TO_HIVE_UPLOAD_S3_BUCKET = None -#The directory within the bucket specified that will contain all the external tables +# The directory within the bucket specified above that will +# contain all the external tables CSV_TO_HIVE_UPLOAD_DIRECTORY = "EXTERNAL_HIVE_TABLES/" # A dictionary of items that gets merged into the Jinja context for diff --git a/superset/db_engine_specs.py b/superset/db_engine_specs.py index 91a63787ae267..3940bfc80aca5 100644 --- a/superset/db_engine_specs.py +++ b/superset/db_engine_specs.py @@ -94,12 +94,7 @@ def csv_to_df(**kwargs): return df @staticmethod - def df_to_db(**kwargs): - df = kwargs['df'] - table = kwargs['table'] - del kwargs['df'] - del kwargs['table'] - kwargs['con'] = create_engine(kwargs['con'], echo=False) + def df_to_db(df, table, **kwargs): df.to_sql(**kwargs) table.user_id = g.user.id @@ -148,7 +143,7 @@ def allowed_file(filename): 'table': table, 'df': df, 'name': form.name.data, - 'con': form.con.data, + 'con': create_engine(form.con.data, echo=False), 'schema': form.schema.data, 'if_exists': form.if_exists.data, 'index': form.index.data, @@ -816,9 +811,9 @@ def get_column_names(filepath): if not bucket_path: logging.info("No upload bucket specified") - flash( + flash(_( "No upload bucket specified. 
" - "You can specify one in the config file.", + "You can specify one in the config file."), "alert") return False diff --git a/superset/forms.py b/superset/forms.py index 21f471cfb4bd8..48ea744c5050e 100644 --- a/superset/forms.py +++ b/superset/forms.py @@ -10,7 +10,7 @@ from flask_wtf.file import FileField, FileAllowed, FileRequired from wtforms import ( BooleanField, SelectField, IntegerField, StringField) -from wtforms.validators import DataRequired, Optional, NumberRange +from wtforms.validators import DataRequired, InputRequired, Optional, NumberRange from superset import app @@ -30,8 +30,8 @@ class CsvToDatabaseForm(DynamicForm): FileRequired(), FileAllowed(['csv'], _('CSV Files Only!'))]) con = SelectField( - _('Database URI'), - description=_('URI of database in which to add above table.'), + _('Database'), + description=_('database in which to add above table.'), validators=[DataRequired()], choices=[]) if_exists = SelectField( diff --git a/superset/views/core.py b/superset/views/core.py index 09be9cf687640..85dfe4a6e60b0 100755 --- a/superset/views/core.py +++ b/superset/views/core.py @@ -342,7 +342,7 @@ def form_get(self, form): models.Database.sqlalchemy_uri, models.Database.database_name)\ .all() - form.con.choices = all_datasources + form.con.choices += all_datasources def form_post(self, form): def upload_file(csv_file): @@ -351,7 +351,8 @@ def upload_file(csv_file): csv_file.save(os.path.join(config['UPLOAD_FOLDER'], filename)) return filename - form.names.data = form.names.data.split(",") if form.names.data else None + form.names.data =\ + form.names.data.split(",") if form.names.data else None database = ( db.session.query(models.Database) .filter_by(sqlalchemy_uri=form.data.get('con')) From abd6ba1f97c02b6adcc677625d4887d4d29e943a Mon Sep 17 00:00:00 2001 From: Timi Fasubaa Date: Thu, 12 Oct 2017 17:48:58 -0700 Subject: [PATCH 5/8] add tests --- tests/core_tests.py | 52 ++++++++++++++++++++++++++++++++++++++++++++- tox.ini | 4 ++-- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/tests/core_tests.py b/tests/core_tests.py index a82106465cea7..40a37f2f9884e 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -9,13 +9,16 @@ import io import json import logging +import os import random +import string import unittest from flask import escape +from sqlalchemy.engine import create_engine import sqlalchemy as sqla -from superset import appbuilder, db, jinja_context, sm, sql_lab, utils +from superset import app, appbuilder, db, jinja_context, sm, sql_lab, utils from superset.connectors.sqla.models import SqlaTable from superset.models import core as models from superset.models.sql_lab import Query @@ -786,6 +789,53 @@ def test_viz_get_fillna_for_columns(self): {'name': ' NULL', 'sum__num': 0}, ) + def test_import_csv(self): + self.login(username='admin') + config = app.config + print(config['SQLALCHEMY_DATABASE_URI']) #add the id mappings! 
don't go straight to the + filename = "testCSV.csv" + table_name = ''.join( + random.choice(string.ascii_uppercase) for _ in range(5)) + + test_file = open(filename, 'w+') + test_file.write("a,b\n") + test_file.write("john,1\n") + test_file.write("paul,2\n") + test_file.close() + engine = create_engine(config['SQLALCHEMY_DATABASE_URI']) + + main_db_uri = db.session.query( + models.Database.sqlalchemy_uri)\ + .filter_by(database_name="main").all() + + test_file = open(filename, 'rb') + form_data = { + 'csv_file': test_file, + 'sep': ',', + 'name': table_name, + 'con': main_db_uri[0][0], + 'if_exists': 'append', + 'index_label': 'test_label', + 'mangle_dupe_cols': False} + + url = '/databaseview/list/' + add_datasource_page = self.get_resp(url) + assert 'Upload a CSV' in add_datasource_page + + url = '/csvtodatabaseview/form' + form_get = self.get_resp(url) + assert 'CSV to Database configuration' in form_get + + try: + # ensure uploaded successfully + form_post = self.get_resp(url, data=form_data) + if "Not a valid choice" in form_post: + print("Not a valid choice") + return + assert 'CSV file "testCSV.csv" uploaded to table' in form_post + finally: + os.remove(filename) + if __name__ == '__main__': unittest.main() diff --git a/tox.ini b/tox.ini index 78198ea190894..1878fddc915a1 100644 --- a/tox.ini +++ b/tox.ini @@ -68,12 +68,12 @@ commands = [testenv:py27-mysql] basepython = python2.7 setenv = - SUPERSET__SQLALCHEMY_DATABASE_URI = mysql://mysqluser:mysqluserpassword@localhost/superset?charset=utf8 + SUPERSET__SQLALCHEMY_DATABASE_URI = mysql://root@localhost/superset?charset=utf8 [testenv:py34-mysql] basepython = python3.4 setenv = - SUPERSET__SQLALCHEMY_DATABASE_URI = mysql://mysqluser:mysqluserpassword@localhost/superset + SUPERSET__SQLALCHEMY_DATABASE_URI = mysql://root@localhost/superset [testenv:py35-mysql] basepython = python3.5 From abdca6fd75604cfc9f1a4cbf9ef2b62c6e567228 Mon Sep 17 00:00:00 2001 From: Timi Fasubaa Date: Wed, 15 Nov 2017 11:04:26 -0800 Subject: [PATCH 6/8] fix linting errors and merge conflicts --- superset/config.py | 2 +- superset/db_engine_specs.py | 66 +++++++++++++------------------- superset/forms.py | 75 +++++-------------------------------- superset/views/core.py | 30 +++++++-------- tests/core_tests.py | 22 ++++------- 5 files changed, 56 insertions(+), 139 deletions(-) diff --git a/superset/config.py b/superset/config.py index fb9bbf4a2f02a..cea4870a61c36 100644 --- a/superset/config.py +++ b/superset/config.py @@ -306,7 +306,7 @@ class CeleryConfig(object): # The directory within the bucket specified above that will # contain all the external tables -CSV_TO_HIVE_UPLOAD_DIRECTORY = "EXTERNAL_HIVE_TABLES/" +CSV_TO_HIVE_UPLOAD_DIRECTORY = 'EXTERNAL_HIVE_TABLES/' # A dictionary of items that gets merged into the Jinja context for # SQL Lab. 
The existing context gets updated with this dictionary, diff --git a/superset/db_engine_specs.py b/superset/db_engine_specs.py index 3940bfc80aca5..c1c826ee0c98d 100644 --- a/superset/db_engine_specs.py +++ b/superset/db_engine_specs.py @@ -17,18 +17,18 @@ from __future__ import unicode_literals from collections import defaultdict, namedtuple +import csv import inspect import logging +import os import re import textwrap import time -import os -import csv import boto3 -import pandas +from flask import g from flask_babel import lazy_gettext as _ -from flask import g, flash +import pandas from sqlalchemy import select from sqlalchemy.engine import create_engine from sqlalchemy.engine.url import make_url @@ -96,14 +96,14 @@ def csv_to_df(**kwargs): @staticmethod def df_to_db(df, table, **kwargs): df.to_sql(**kwargs) - table.user_id = g.user.id table.schema = kwargs['schema'] + table.fetch_metadata() db.session.add(table) db.session.commit() @staticmethod - def upload_csv(form, table): + def create_table_from_csv(form, table): def allowed_file(filename): # Only allow specific file extensions as specified in the config extension = os.path.splitext(filename)[1] @@ -111,16 +111,12 @@ def allowed_file(filename): filename = secure_filename(form.csv_file.data.filename) if not allowed_file(filename): - flash("Invalid file type selected.", 'alert') - return False + return (False, 'Invalid file type selected.') kwargs = { - 'names': form.names.data if form.names.data else None, 'filepath_or_buffer': filename, 'sep': form.sep.data, - 'header': form.header.data, + 'header': form.header.data if form.header.data else 0, 'index_col': form.index_col.data, - 'squeeze': form.squeeze.data, - 'prefix': form.prefix.data, 'mangle_dupe_cols': form.mangle_dupe_cols.data, 'skipinitialspace': form.skipinitialspace.data, 'skiprows': form.skiprows.data, @@ -128,13 +124,6 @@ def allowed_file(filename): 'skip_blank_lines': form.skip_blank_lines.data, 'parse_dates': form.parse_dates.data, 'infer_datetime_format': form.infer_datetime_format.data, - 'dayfirst': form.dayfirst.data, - 'thousands': form.thousands.data, - 'decimal': form.decimal.data, - 'quotechar': form.quotechar.data, - 'escapechar': form.escapechar.data, - 'comment': form.comment.data, - 'error_bad_lines': form.error_bad_lines.data, 'chunksize': 10000, } df = BaseEngineSpec.csv_to_df(**kwargs) @@ -151,7 +140,7 @@ def allowed_file(filename): 'chunksize': 10000, } BaseEngineSpec.df_to_db(**df_to_db_kwargs) - return True + return (True, '') @classmethod def escape_sql(cls, sql): @@ -798,40 +787,36 @@ def fetch_result_sets(cls, db, datasource_type, force=False): db, datasource_type, force=force) @staticmethod - def upload_csv(form, table): + def create_table_from_csv(form, table): """Uploads a csv file and creates a superset datasource in Hive.""" def get_column_names(filepath): - with open(filepath, "rb") as f: + with open(filepath, 'rb') as f: return csv.reader(f).next() table_name = form.name.data filename = form.csv_file.data.filename - bucket_path = config["CSV_TO_HIVE_UPLOAD_BUCKET"] + bucket_path = app.config['CSV_TO_HIVE_UPLOAD_BUCKET'] if not bucket_path: - logging.info("No upload bucket specified") - flash(_( - "No upload bucket specified. " - "You can specify one in the config file."), - "alert") - return False + logging.info('No upload bucket specified') + return ( + False, + 'No upload bucket specified. 
You can specify one in the config file.') - upload_prefix = config["CSV_TO_HIVE_UPLOAD_DIRECTORY"] + upload_prefix = app.config['CSV_TO_HIVE_UPLOAD_DIRECTORY'] dest_path = os.path.join(table_name, filename) - upload_path = config['UPLOAD_FOLDER'] + secure_filename( - form.csv_file.data.filename) + upload_path = app.config['UPLOAD_FOLDER'] + \ + secure_filename(form.csv_file.data.filename) column_names = get_column_names(upload_path) - schema_definition = ", ".join( - [s + " STRING " for s in column_names]) + schema_definition = ', '.join( + [s + ' STRING ' for s in column_names]) s3 = boto3.client('s3') - location =\ - os.path.join("s3a://", bucket_path, upload_prefix, table_name) + location = os.path.join('s3a://', bucket_path, upload_prefix, table_name) s3.upload_file( - upload_path, - 'airbnb-superset', + upload_path, 'airbnb-superset', os.path.join(upload_prefix, table_name, filename)) sql = """CREATE EXTERNAL TABLE {table_name} ( {schema_definition} ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS @@ -840,12 +825,11 @@ def get_column_names(filepath): logging.info(form.con.data) engine = create_engine(form.con.data) engine.execute(sql) - return True + return (True, '') except Exception as e: logging.exception(e) logging.info(sql) - flash(BaseEngineSpec.extract_error_message(e), "alert") - return False + return (False, BaseEngineSpec.extract_error_message(e)) @classmethod def convert_dttm(cls, target_type, dttm): diff --git a/superset/forms.py b/superset/forms.py index 48ea744c5050e..a07790440ffde 100644 --- a/superset/forms.py +++ b/superset/forms.py @@ -4,13 +4,13 @@ from __future__ import print_function from __future__ import unicode_literals -from flask_babel import lazy_gettext as _ from flask_appbuilder.fieldwidgets import BS3TextFieldWidget from flask_appbuilder.forms import DynamicForm -from flask_wtf.file import FileField, FileAllowed, FileRequired +from flask_babel import lazy_gettext as _ +from flask_wtf.file import FileAllowed, FileField, FileRequired from wtforms import ( - BooleanField, SelectField, IntegerField, StringField) -from wtforms.validators import DataRequired, InputRequired, Optional, NumberRange + BooleanField, IntegerField, SelectField, StringField) +from wtforms.validators import DataRequired, NumberRange, Optional from superset import app @@ -34,6 +34,11 @@ class CsvToDatabaseForm(DynamicForm): description=_('database in which to add above table.'), validators=[DataRequired()], choices=[]) + sep = StringField( + _('Delimiter'), + description=_('Delimiter used by CSV file (for whitespace use \s+).'), + validators=[DataRequired()], + widget=BS3TextFieldWidget()) if_exists = SelectField( _('Table Exists'), description=_( @@ -51,11 +56,6 @@ class CsvToDatabaseForm(DynamicForm): validators=[Optional()], widget=BS3TextFieldWidget(), filters=[lambda x: x or None]) - sep = StringField( - _('Delimiter'), - description=_('Delimiter used by CSV file (for whitespace use \s+).'), - validators=[DataRequired()], - widget=BS3TextFieldWidget()) header = IntegerField( _('Header Row'), description=_( @@ -65,15 +65,6 @@ class CsvToDatabaseForm(DynamicForm): validators=[Optional()], widget=BS3TextFieldWidget(), filters=[lambda x: x or None]) - names = StringField( - _('Column Names'), - description=_( - 'List of comma-separated column names to use if ' - 'header row not specified above. 
Leave empty if header ' - 'field populated.'), - validators=[Optional()], - widget=BS3TextFieldWidget(), - filters=[lambda x: x or None]) index_col = IntegerField( _('Index Column'), description=_( @@ -82,19 +73,6 @@ class CsvToDatabaseForm(DynamicForm): validators=[Optional(), NumberRange(0, 1E+20)], widget=BS3TextFieldWidget(), filters=[lambda x: x or None]) - squeeze = BooleanField( - _('Squeeze'), - description=_( - 'Parse the data as a series (specify ' - 'this option if the data contains only one column.)')) - prefix = StringField( - _('Prefix'), - description=_( - 'Prefix to add to column numbers when no header ' - '(e.g. "X" for "X0, X1").'), - validators=[Optional()], - widget=BS3TextFieldWidget(), - filters=[lambda x: x or None]) mangle_dupe_cols = BooleanField( _('Mangle Duplicate Columns'), description=_('Specify duplicate columns as "X.0, X.1".')) @@ -126,47 +104,12 @@ class CsvToDatabaseForm(DynamicForm): description=_( 'Use Pandas to interpret the datetime format ' 'automatically.')) - dayfirst = BooleanField( - _('Day First'), - description=_( - 'Use DD/MM (European/International) date format.')) - thousands = StringField( - _('Thousands Separator'), - description=_('Separator for values in thousands.'), - validators=[Optional()], - widget=BS3TextFieldWidget(), - filters=[lambda x: x or None]) decimal = StringField( _('Decimal Character'), description=_('Character to interpret as decimal point.'), validators=[Optional()], widget=BS3TextFieldWidget(), filters=[lambda x: x or '.']) - quotechar = StringField( - _('Quote Character'), - description=_( - 'Character used to denote the start and end of a quoted item.'), - validators=[Optional()], - widget=BS3TextFieldWidget(), - filters=[lambda x: x or "'"]) - escapechar = StringField( - _('Escape Character'), - description=_('Character used to escape a quoted item.'), - validators=[Optional()], - widget=BS3TextFieldWidget(), - filters=[lambda x: x or None]) - comment = StringField( - _('Comment Character'), - description=_('Character used to denote the start of a comment.'), - validators=[Optional()], - widget=BS3TextFieldWidget(), - filters=[lambda x: x or None]) - error_bad_lines = BooleanField( - _('Error On Bad Lines'), - description=_( - 'Error on bad lines (e.g. a line with ' - 'too many commas). 
If false these bad lines will instead ' - 'be dropped from the resulting dataframe.')) index = BooleanField( _('Dataframe Index'), description=_('Write dataframe index as a column.')) diff --git a/superset/views/core.py b/superset/views/core.py index 85dfe4a6e60b0..679803a7571c4 100755 --- a/superset/views/core.py +++ b/superset/views/core.py @@ -13,11 +13,11 @@ import time import traceback from urllib import parse + from flask import ( flash, g, Markup, redirect, render_template, request, Response, url_for, ) from flask_appbuilder import expose, SimpleFormView - from flask_appbuilder.actions import action from flask_appbuilder.models.sqla.interface import SQLAInterface from flask_appbuilder.security.decorators import has_access_api @@ -43,7 +43,6 @@ from superset.models.sql_lab import Query from superset.sql_parse import SupersetQuery from superset.utils import has_access, merge_extra_filters, QueryStatus - from .base import ( api, BaseSupersetView, CsvResponse, DeleteMixin, get_error_msg, get_user_roles, json_error_response, SupersetFilter, SupersetModelView, @@ -327,16 +326,12 @@ class CsvToDatabaseView(SimpleFormView): def form_get(self, form): form.sep.data = ',' form.header.data = 0 - form.squeeze.data = False - form.names.data = None form.mangle_dupe_cols.data = True form.skipinitialspace.data = False form.skip_blank_lines.data = True form.parse_dates.data = True form.infer_datetime_format.data = True - form.dayfirst.data = False form.decimal.data = '.' - form.error_bad_lines.data = False form.if_exists.data = 'append' all_datasources = db.session.query( models.Database.sqlalchemy_uri, @@ -351,18 +346,18 @@ def upload_file(csv_file): csv_file.save(os.path.join(config['UPLOAD_FOLDER'], filename)) return filename - form.names.data =\ - form.names.data.split(",") if form.names.data else None + csv_file = form.csv_file.data + upload_file(csv_file) + table = SqlaTable(table_name=form.name.data) database = ( db.session.query(models.Database) .filter_by(sqlalchemy_uri=form.data.get('con')) .one() ) - upload_file(form.csv_file.data) - table = SqlaTable(table_name=form.name.data) - table.database_id = database.id table.database = database - successful = database.db_engine_spec.upload_csv(form, table) + table.database_id = database.id + successful, message = database.db_engine_spec.create_table_from_csv(form, table) + os.remove(os.path.join(config['UPLOAD_FOLDER'], csv_file.filename)) if successful: # Go back to welcome page / splash screen db_name = db.session.query(models.Database.database_name)\ @@ -375,6 +370,9 @@ def upload_file(csv_file): flash(message, 'info') return redirect('/tablemodelview/list/') + else: + flash(message, 'info') + appbuilder.add_view_no_menu(CsvToDatabaseView) @@ -2532,13 +2530,13 @@ class CssTemplateAsyncModelView(CssTemplateModelView): appbuilder.add_link( 'Upload a CSV', - label=__("Upload a CSV"), + label=__('Upload a CSV'), href='/csvtodatabaseview/form', - icon="fa-upload", + icon='fa-upload', category='Sources', - category_label=__("Sources"), + category_label=__('Sources'), category_icon='fa-wrench',) -appbuilder.add_separator("Sources") +appbuilder.add_separator('Sources') @app.after_request diff --git a/tests/core_tests.py b/tests/core_tests.py index 40a37f2f9884e..2edf8988a329e 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -15,10 +15,9 @@ import unittest from flask import escape -from sqlalchemy.engine import create_engine import sqlalchemy as sqla -from superset import app, appbuilder, db, jinja_context, sm, sql_lab, utils +from superset 
import appbuilder, db, jinja_context, sm, sql_lab, utils from superset.connectors.sqla.models import SqlaTable from superset.models import core as models from superset.models.sql_lab import Query @@ -791,22 +790,18 @@ def test_viz_get_fillna_for_columns(self): def test_import_csv(self): self.login(username='admin') - config = app.config - print(config['SQLALCHEMY_DATABASE_URI']) #add the id mappings! don't go straight to the - filename = "testCSV.csv" + filename = 'testCSV.csv' table_name = ''.join( random.choice(string.ascii_uppercase) for _ in range(5)) test_file = open(filename, 'w+') - test_file.write("a,b\n") - test_file.write("john,1\n") - test_file.write("paul,2\n") + test_file.write('a,b\n') + test_file.write('john,1\n') + test_file.write('paul,2\n') test_file.close() - engine = create_engine(config['SQLALCHEMY_DATABASE_URI']) - main_db_uri = db.session.query( models.Database.sqlalchemy_uri)\ - .filter_by(database_name="main").all() + .filter_by(database_name='main').all() test_file = open(filename, 'rb') form_data = { @@ -829,10 +824,7 @@ def test_import_csv(self): try: # ensure uploaded successfully form_post = self.get_resp(url, data=form_data) - if "Not a valid choice" in form_post: - print("Not a valid choice") - return - assert 'CSV file "testCSV.csv" uploaded to table' in form_post + assert 'CSV file \"testCSV.csv\" uploaded to table' in form_post finally: os.remove(filename) From 80cead575c71009f465df19859f4748de2616e37 Mon Sep 17 00:00:00 2001 From: timifasubaa <30888507+timifasubaa@users.noreply.github.com> Date: Mon, 27 Nov 2017 13:59:25 -0800 Subject: [PATCH 7/8] Update .travis.yml --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 4d1f6a5ea71d9..d32057c8d7e59 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,7 +24,7 @@ before_install: before_script: - mysql -u root -e "DROP DATABASE IF EXISTS superset; CREATE DATABASE superset DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci" - mysql -u root -e "CREATE USER 'mysqluser'@'localhost' IDENTIFIED BY 'mysqluserpassword';" - - mysql -u root -e "GRANT ALL ON *.* TO 'mysqluser'@'localhost';" + - mysql -u root -e "GRANT ALL ON superset.* TO 'mysqluser'@'localhost';" - psql -U postgres -c "CREATE DATABASE superset;" - psql -U postgres -c "CREATE USER postgresuser WITH PASSWORD 'pguserpassword';" - export PATH=${PATH}:/tmp/hive/bin From ba08a1df482d0874ef2aeeeea063596fca40d12a Mon Sep 17 00:00:00 2001 From: timifasubaa <30888507+timifasubaa@users.noreply.github.com> Date: Mon, 27 Nov 2017 14:04:28 -0800 Subject: [PATCH 8/8] Update tox.ini --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 78198ea190894..b9b8c11047b96 100644 --- a/tox.ini +++ b/tox.ini @@ -68,7 +68,7 @@ commands = [testenv:py27-mysql] basepython = python2.7 setenv = - SUPERSET__SQLALCHEMY_DATABASE_URI = mysql://mysqluser:mysqluserpassword@localhost/superset?charset=utf8 + SUPERSET__SQLALCHEMY_DATABASE_URI = mysql://root@localhost/superset?charset=utf8 [testenv:py34-mysql] basepython = python3.4
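
Note: for non-Hive databases, the upload path in this series boils down to a chunked pandas read followed by DataFrame.to_sql (see BaseEngineSpec.csv_to_df and df_to_db). A minimal, standalone sketch of that flow follows; the file name 'example.csv', the table name 'example_table', and the local SQLite URI are illustrative assumptions and do not come from the patches.

    # Sketch of the chunked read_csv -> to_sql flow used by the non-Hive path.
    # 'example.csv', 'example_table', and the SQLite URI are assumptions.
    import pandas as pd
    from sqlalchemy import create_engine

    # Read the CSV in 10,000-row chunks (mirroring the chunksize used in
    # upload_csv) and concatenate into a single DataFrame.
    chunks = pd.read_csv('example.csv', chunksize=10000, iterator=True)
    df = pd.concat(chunk for chunk in chunks)

    # Write the DataFrame to the target database; if_exists='append'
    # matches the form default set in CsvToDatabaseView.form_get.
    engine = create_engine('sqlite:///example.db', echo=False)
    df.to_sql('example_table', con=engine, if_exists='append',
              index=False, chunksize=10000)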