From 1b7646ea2fd3404864929bc67f614e51d24a9234 Mon Sep 17 00:00:00 2001
From: Rahul Iyer
Date: Mon, 7 Oct 2013 11:16:14 -0700
Subject: [PATCH] Upgrade: Add script to upgrade to v1.3 from v1.0, v1.1, v1.2

Pivotal Tracker: 57615140

Changes:
- Added all changelist files needed to upgrade to v1.3
- Updated the madpack script for the new upgrade paths
- Fixed issues around whitespace between arguments
- Release notes for v1.3
- Updated version numbers
- Updated PGXN versioning style to accommodate requirements
---
 ReleaseNotes.txt                              |  36 ++
 deploy/PGXN/CMakeLists.txt                    |   4 +-
 deploy/gppkg/CMakeLists.txt                   |   2 +-
 deploy/postflight.sh                          |   2 +-
 src/config/Version.yml                        |   2 +-
 src/madpack/changelist.yaml                   |  34 +-
 ...t_1.0_1.2.yaml => changelist_1.0_1.3.yaml} |  36 +-
 src/madpack/changelist_1.1_1.3.yaml           |  61 ++
 src/madpack/madpack.py                        |  78 ++-
 src/madpack/upgrade_util.py                   | 536 +++++++++---------
 .../postgres/modules/regress/linear.py_in     | 149 ++++-
 .../postgres/modules/regress/linear.sql_in    | 144 +++--
 .../modules/regress/test/linear.sql_in        |   9 +-
 .../modules/regress/test/logistic.sql_in      |   2 +-
 14 files changed, 710 insertions(+), 385 deletions(-)
 rename src/madpack/{changelist_1.0_1.2.yaml => changelist_1.0_1.3.yaml} (57%)
 create mode 100644 src/madpack/changelist_1.1_1.3.yaml

diff --git a/ReleaseNotes.txt b/ReleaseNotes.txt
index 6c82152a1..ecaf4d57d 100644
--- a/ReleaseNotes.txt
+++ b/ReleaseNotes.txt
@@ -8,6 +8,42 @@ A complete list of changes for each release can be obtained by viewing the git
 commit history located at https://github.com/madlib/madlib/commits/master.
 Current list of bugs and issues can be found at http://jira.madlib.net.
 
+--------------------------------------------------------------------------------
+MADlib v1.3
+
+Release Date: 2013-October-03
+
+New Features:
+* Cox Proportional Hazards:
+    - Added stratification support for Cox PH models. Stratification is
+      shorthand for building a Cox model that allows more than one stratum
+      and, hence, more than one baseline hazard function. Stratification
+      provides two key pieces of functionality for the end user of Cox
+      models:
+      -- It allows a categorical variable Z to be appropriately accounted
+         for in the model without estimating its predictive impact on the
+         response variable.
+      -- It accommodates a categorical variable Z that is predictive of, or
+         associated with, the response variable but may not satisfy the
+         proportional hazards assumption.
+    - Added a new function (cox_zph) that tests the proportional hazards
+      assumption of a Cox model, so that users can build Cox models and
+      then verify the relevance of the model.
+* NULL Handling:
+    - Modified the behavior of linear and logistic regression to omit rows
+      containing NULL values for any of the dependent and independent
+      variables. The number of rows skipped is reported in the output table.
+
+Deprecated functions:
+    - The Cox Proportional Hazards function has been renamed to
+      'coxph_train'. The old function names ('cox_prop_hazards' and
+      'cox_prop_hazards_regr') have been deprecated and will be removed in
+      the next major version update.
+    - The aggregate form of linear regression ('linregr') has been deprecated.
+      The stored-procedure form ('linregr_train') should be used instead.
+
+Bug Fixes:
+    - Fixed a memory leak in the Apriori algorithm.
+ + -------------------------------------------------------------------------------- MADlib v1.2 diff --git a/deploy/PGXN/CMakeLists.txt b/deploy/PGXN/CMakeLists.txt index 7e57c1966..39c4c8233 100644 --- a/deploy/PGXN/CMakeLists.txt +++ b/deploy/PGXN/CMakeLists.txt @@ -3,8 +3,10 @@ # ------------------------------------------------------------------------------ set(MADLIB_PGXN_RELEASE_NUMBER 1) +# set(MADLIB_PGXN_VERSION_STR +# "${MADLIB_VERSION_MAJOR}.${MADLIB_VERSION_MINOR}.${MADLIB_VERSION_PATCH}release${MADLIB_PGXN_RELEASE_NUMBER} set(MADLIB_PGXN_VERSION_STR - "${MADLIB_VERSION_MAJOR}.${MADLIB_VERSION_MINOR}.${MADLIB_VERSION_PATCH}release${MADLIB_PGXN_RELEASE_NUMBER}") + "${MADLIB_VERSION_MAJOR}.${MADLIB_VERSION_MINOR}.${MADLIB_VERSION_PATCH}") set(MADLIB_PGXN_NAME "madlib-pgxn-${MADLIB_PGXN_VERSION_STR}") configure_file(META.json.in META.json) diff --git a/deploy/gppkg/CMakeLists.txt b/deploy/gppkg/CMakeLists.txt index b9a34307a..dfa4c704f 100644 --- a/deploy/gppkg/CMakeLists.txt +++ b/deploy/gppkg/CMakeLists.txt @@ -2,7 +2,7 @@ # Packaging for Greenplum's gppkg # ------------------------------------------------------------------------------ -set(MADLIB_GPPKG_VERSION "1.7") +set(MADLIB_GPPKG_VERSION "1.7.1") set(MADLIB_GPPKG_RELEASE_NUMBER 1) set(MADLIB_GPPKG_RPM_SOURCE_DIR "${CMAKE_BINARY_DIR}/_CPack_Packages/Linux/RPM/${CPACK_PACKAGE_FILE_NAME}" diff --git a/deploy/postflight.sh b/deploy/postflight.sh index cc106e4df..9e4ea3fb2 100755 --- a/deploy/postflight.sh +++ b/deploy/postflight.sh @@ -2,7 +2,7 @@ # $0 - Script Path, $1 - Package Path, $2 - Target Location, and $3 - Target Volumn -MADLIB_VERSION=1.2 +MADLIB_VERSION=1.3 find /usr/local/madlib/bin -type d -exec cp -RPf {} /usr/local/madlib/old_bin \; 2>/dev/null find /usr/local/madlib/bin -depth -type d -exec rm -r {} \; 2>/dev/null diff --git a/src/config/Version.yml b/src/config/Version.yml index 369bd3fdc..4aa48c34f 100644 --- a/src/config/Version.yml +++ b/src/config/Version.yml @@ -1 +1 @@ -version: 1.2 +version: 1.3 diff --git a/src/madpack/changelist.yaml b/src/madpack/changelist.yaml index 48db15d76..c5cf31071 100644 --- a/src/madpack/changelist.yaml +++ b/src/madpack/changelist.yaml @@ -1,4 +1,4 @@ -# Changelist for MADlib version 1.1 to 1.2 +# Changelist for MADlib version 1.2 to 1.3 # This file contains all changes that were introduced in a new version of # MADlib. This changelist is used by the upgrade script to detect what objects @@ -9,11 +9,11 @@ # file installed on the upgrade version. All other files (that don't have # updates), are cleaned up to remove object replacements new module: - arima: - arima_forecast: # Changes in the types (UDT) including removal and modification udt: + __logregr_result: + linregr_result: # List of the UDF changes that affect the user externally. This includes change # in function name, change in argument order or argument types, and removal of @@ -22,10 +22,36 @@ udt: # are user views dependent on this function, since the original function will # not be present in the upgraded version. 
udf: + # linear regression: 'num_processed' added in 'linregr_result' + - linregr_final: + rettype: schema_madlib.linregr_result + argument: schema_madlib.bytea8 + - linregr_merge_states: + rettype: schema_madlib.bytea8 + argument: schema_madlib.bytea8, schema_madlib.bytea8 + - linregr_transition: + rettype: schema_madlib.bytea8 + argument: schema_madlib.bytea8, double precision, double precision[] + + # logistic regression: 'num_processed' added in '__logregr_result' + - __logregr_cg_result: + rettype: schema_madlib.__logregr_result + argument: double precision[] + + - __logregr_irls_result: + rettype: schema_madlib.__logregr_result + argument: double precision[] + + - __logregr_igd_result: + rettype: schema_madlib.__logregr_result + argument: double precision[] # Changes to aggregates (UDA) including removal and modification # Overloaded functions should be mentioned separately uda: - + - linregr: + rettype: schema_madlib.linregr_result + argument: double precision, double precision[] + # Cast operators (UDC) updated/added in v1.1 udc: diff --git a/src/madpack/changelist_1.0_1.2.yaml b/src/madpack/changelist_1.0_1.3.yaml similarity index 57% rename from src/madpack/changelist_1.0_1.2.yaml rename to src/madpack/changelist_1.0_1.3.yaml index c53e7ba14..3b96f28d2 100644 --- a/src/madpack/changelist_1.0_1.2.yaml +++ b/src/madpack/changelist_1.0_1.3.yaml @@ -1,4 +1,4 @@ -# Changelist for MADlib version 1.0 to 1.2 +# Changelist for MADlib version 1.0 to 1.3 # This file contains all changes that were introduced in a new version of # MADlib. This changelist is used by the upgrade script to detect what objects @@ -17,7 +17,8 @@ new module: # Changes in the types (UDT) including removal and modification udt: - + __logregr_result: + linregr_result: # List of the UDF changes that affect the user externally. This includes change # in function name, change in argument order or argument types, and removal of # the function. In each case, the original function is as good as removed and a @@ -30,9 +31,36 @@ udf: rettype: schema_madlib.matrix_result argument: matrix_in text, matrix_out text + # linear regression: 'num_processed' added in 'linregr_result' + - linregr_final: + rettype: schema_madlib.linregr_result + argument: schema_madlib.bytea8 + - linregr_merge_states: + rettype: schema_madlib.bytea8 + argument: schema_madlib.bytea8, schema_madlib.bytea8 + - linregr_transition: + rettype: schema_madlib.bytea8 + argument: schema_madlib.bytea8, double precision, double precision[] + + # logistic regression: 'num_processed' added in '__logregr_result' + - __logregr_cg_result: + rettype: schema_madlib.__logregr_result + argument: double precision[] + + - __logregr_irls_result: + rettype: schema_madlib.__logregr_result + argument: double precision[] + + - __logregr_igd_result: + rettype: schema_madlib.__logregr_result + argument: double precision[] + # Changes to aggregates (UDA) including removal and modification # Overloaded functions should be mentioned separately uda: - + - linregr: + rettype: schema_madlib.linregr_result + argument: double precision, double precision[] + # Cast operators (UDC) updated/added in v1.1/v1.2 -udc: +udc: \ No newline at end of file diff --git a/src/madpack/changelist_1.1_1.3.yaml b/src/madpack/changelist_1.1_1.3.yaml new file mode 100644 index 000000000..df432eaf8 --- /dev/null +++ b/src/madpack/changelist_1.1_1.3.yaml @@ -0,0 +1,61 @@ + +# Changelist for MADlib version 1.1 to 1.3 + +# This file contains all changes that were introduced in a new version of +# MADlib. 
This changelist is used by the upgrade script to detect what objects +# should be upgraded (while retaining all other objects from the previous version) + +# New modules (actually .sql_in files) added in upgrade version +# For these files the sql_in code is retained as is with the functions in the +# file installed on the upgrade version. All other files (that don't have +# updates), are cleaned up to remove object replacements +new module: + arima: + arima_forecast: + + +# Changes in the types (UDT) including removal and modification +udt: + __logregr_result: + linregr_result: + +# List of the UDF changes that affect the user externally. This includes change +# in function name, change in argument order or argument types, and removal of +# the function. In each case, the original function is as good as removed and a +# new function is created. In such cases, we should abort the upgrade if there +# are user views dependent on this function, since the original function will +# not be present in the upgraded version. +udf: + # linear regression: 'num_processed' added in 'linregr_result' + - linregr_final: + rettype: schema_madlib.linregr_result + argument: schema_madlib.bytea8 + - linregr_merge_states: + rettype: schema_madlib.bytea8 + argument: schema_madlib.bytea8, schema_madlib.bytea8 + - linregr_transition: + rettype: schema_madlib.bytea8 + argument: schema_madlib.bytea8, double precision, double precision[] + + # logistic regression: 'num_processed' added in '__logregr_result' + - __logregr_cg_result: + rettype: schema_madlib.__logregr_result + argument: double precision[] + + - __logregr_irls_result: + rettype: schema_madlib.__logregr_result + argument: double precision[] + + - __logregr_igd_result: + rettype: schema_madlib.__logregr_result + argument: double precision[] + +# Changes to aggregates (UDA) including removal and modification +# Overloaded functions should be mentioned separately +uda: + - linregr: + rettype: schema_madlib.linregr_result + argument: double precision, double precision[] + +# Cast operators (UDC) updated/added in v1.1 +udc: diff --git a/src/madpack/madpack.py b/src/madpack/madpack.py index f87841fa0..7f9c6f9bf 100755 --- a/src/madpack/madpack.py +++ b/src/madpack/madpack.py @@ -94,7 +94,7 @@ def __error(msg, stop): # @param msg info message # @param verbose prints only if True ## # # # # # # # # # # # # # # # # # # # # # # # # # # # # -def __info(msg, verbose): +def __info(msg, verbose=True): # Print to stdout if verbose: print this + ' : INFO : ' + msg @@ -227,8 +227,8 @@ def __run_sql_file(schema, maddir_mod_py, module, sqlfile, __info(sub_module, False) # Special treatment for new module and 'svec' module - if (sub_module not in sc.get_change_handler().get_newmodule()) and \ - not (sub_module == 'svec' and 'svec' in sc.get_change_handler().get_udt()): + if (sub_module not in sc.get_change_handler().newmodule) and \ + not (sub_module == 'svec' and 'svec' in sc.get_change_handler().udt): sql = open(tmpfile).read() sql = sc.cleanup(sql) open(tmpfile, 'w').write(sql) @@ -373,7 +373,8 @@ def __plpy_check(py_min_ver): __info("Testing PL/Python environment...", True) # Check PL/Python existence - rv = __run_sql_query("SELECT count(*) AS CNT FROM pg_language WHERE lanname = 'plpythonu'", True) + rv = __run_sql_query("SELECT count(*) AS CNT FROM pg_language " + "WHERE lanname = 'plpythonu'", True) if int(rv[0]['cnt']) > 0: __info("> PL/Python already installed", verbose) else: @@ -524,51 +525,80 @@ def __db_upgrade(schema, dbrev): abort = False if 
td.has_dependency(): + __info("*"*50, True) __info("\tFollowing user tables are dependent on updated MADlib types:", True) __info(td.get_dependency_str(), True) + __info("*"*50, True) cd_udt = [udt for udt in td.get_depended_udt() - if udt in ch.get_udt()] + if udt in ch.udt] if len(cd_udt) > 0: __error(""" - User has objects dependent on updated MADlib types ({0})! - These objects need to be dropped before starting upgrade again. Aborting upgrade ... - """.format('\n'.join(cd_udt)), False) + User has objects dependent on following updated MADlib types! + {0} + These objects need to be dropped before upgrading. + """.format('\n\t\t\t'.join(cd_udt)), False) + + # TODO: Remove this after v1.3 + # we add special handling for 'linregr_result' + if 'linregr_result' in cd_udt: + __info("""Dependency on 'linregr_result' could be due to objects + created from the output of the aggregate 'linregr'. + Please refer to the Linear Regression documentation + + for the recommended solution. + """, False) abort = True if vd.has_dependency(): + __info("*"*50, True) __info("\tFollowing user views are dependent on updated MADlib objects:", True) __info(vd.get_dependency_graph_str(), True) + __info("*"*50, True) c_udf = ch.get_udf_signature() d_udf = vd.get_depended_func_signature(False) cd_udf = [udf for udf in d_udf if udf in c_udf] if len(cd_udf) > 0: __error(""" - User has objects dependent on updated MADlib functions ({0})! - These objects will not fail to work with the new functions and - need to be dropped before starting upgrade again. Aborting upgrade ... - """.format('\n'.join(cd_udf)), False) + User has objects dependent on following updated MADlib functions! + {0} + These objects will fail to work with the updated functions and + need to be dropped before starting upgrade again. + """.format('\n\t\t\t\t\t'.join(cd_udf)), False) abort = True + c_uda = ch.get_uda_signature() d_uda = vd.get_depended_func_signature(True) cd_uda = [uda for uda in d_uda if uda in c_uda] if len(cd_uda) > 0: __error(""" - User has objects dependent on updated MADlib functions ({0})! - These objects will not fail to work with the new aggregates and - need to be dropped before starting upgrade again. Aborting upgrade ... - """.format('\n'.join(cd_uda)), False) + User has objects dependent on following updated MADlib aggregates! + {0} + These objects will fail to work with the new aggregates and + need to be dropped before starting upgrade again. + """.format('\n\t\t\t\t\t'.join(cd_uda)), False) abort = True if abort: __error('------- Upgrade aborted. 
-------', True) else: - __info("No explicit dependency problem found, continuing to upgrade ...", True) + __info("No dependency problem found, continuing to upgrade ...", True) if vd.has_dependency(): vd.save_and_drop() __info("\tReading existing UDAs/UDTs...", False) - sc = ScriptCleaner(schema, portid, con_args, ch) + try: + sc = ScriptCleaner(schema, portid, con_args, ch) + except Exception as e: + __info(str(e), True) + raise e + __info("Script Cleaner initialized ...", False) + + + # __info("\tChanged functions: " + str(ch.udf), True) + # __info("\tChanged aggregates: " + str(ch.uda), True) + # __info("\tChanged types: " + str(ch.udt), True) + # __info("\tChanged casts: " + str(ch.udc), True) ch.drop_changed_uda() ch.drop_changed_udt() @@ -1077,8 +1107,8 @@ def main(argv): # FIXME: Change this to get the previous version from a config file if float(dbrev) < 1.0: - __info("""The version gap is too large, only release-by-release - incremental upgrade is supported.""", True) + __info("""The version gap is too large, upgrade is supported only for + packages greater than or equal to v1.0.""", True) return # 3) Run upgrade @@ -1086,11 +1116,11 @@ def main(argv): __plpy_check(py_min_ver) __db_upgrade(schema, dbrev) except Exception as e: - __error("MADlib upgrade failed.", True) #Uncomment the following lines when debugging - #print "Exception: " + str(e) - #print sys.exc_info() - #traceback.print_tb(sys.exc_info()[2]) + print "Exception: " + str(e) + print sys.exc_info() + traceback.print_tb(sys.exc_info()[2]) + __error("MADlib upgrade failed.", True) ### # COMMAND: install-check diff --git a/src/madpack/upgrade_util.py b/src/madpack/upgrade_util.py index 6d67fd449..f5a3755a6 100644 --- a/src/madpack/upgrade_util.py +++ b/src/madpack/upgrade_util.py @@ -1,30 +1,31 @@ import re -import sys import yaml from collections import defaultdict import os -""" -@brief Wrapper function for ____run_sql_query -""" + def run_sql(sql, portid, con_args): + """ + @brief Wrapper function for ____run_sql_query + """ from madpack import ____run_sql_query return ____run_sql_query(sql, True, portid, con_args) -""" -@brief Get the signature of a UDF/UDA for comparison -""" + def get_signature_for_compare(schema, proname, rettype, argument): - signature = '%s %s.%s(%s)' % ( - rettype.strip(), schema.strip(), proname.strip(), argument.strip()) - signature = re.sub('\s+', ' ', signature) + """ + @brief Get the signature of a UDF/UDA for comparison + """ + signature = '{0} {1}.{2}({3})'.format(rettype.strip(), schema.strip(), + proname.strip(), argument.strip()) signature = re.sub('"', '', signature) return signature.lower() -""" -@brief Base class for handling the upgrade -""" + class UpgradeBase: + """ + @brief Base class for handling the upgrade + """ def __init__(self, schema, portid, con_args): self._schema = schema.lower() self._portid = portid @@ -46,21 +47,20 @@ def _get_schema_oid(self): SELECT oid FROM pg_namespace WHERE nspname = '{schema}' """.format(schema=self._schema))[0]['oid'] - - """ - @brief Get the function name, return type, and arguments given an oid - @note The function can only handle the case that proallargtypes is null, - refer to pg_catalog.pg_get_function_identity_argument and - pg_catalog.pg_get_function_result in PG for a complete implementation, which are - not supported by GP - """ def _get_function_info(self, oid): + """ + @brief Get the function name, return type, and arguments given an oid + @note The function can only handle the case that proallargtypes is null, + refer to 
pg_catalog.pg_get_function_identity_argument and + pg_catalog.pg_get_function_result in PG for a complete implementation, which are + not supported by GP + """ row = self._run_sql(""" SELECT max(proname) AS proname, max(rettype) AS rettype, array_to_string( - array_agg(argname || ' ' || argtype order by i), ', ') AS argument + array_agg(argname || ' ' || argtype order by i), ',') AS argument FROM ( SELECT @@ -81,14 +81,16 @@ def _get_function_info(self, oid): oid = {oid} ) AS f """.format(oid=oid)) - return {"proname": row[0]['proname'], 'rettype': row[0]['rettype'], - 'argument': row[0]['argument']} + return {"proname": row[0]['proname'], + "rettype": row[0]['rettype'], + "argument": row[0]['argument']} + -""" -@brief This class reads changes from the configuration file and handles -the dropping of objects -""" class ChangeHandler(UpgradeBase): + """ + @brief This class reads changes from the configuration file and handles + the dropping of objects + """ def __init__(self, schema, portid, con_args, maddir, mad_dbrev): UpgradeBase.__init__(self, schema, portid, con_args) self._opr_ind_svec = None @@ -102,11 +104,10 @@ def __init__(self, schema, portid, con_args, maddir, mad_dbrev): self._udc = None self._load() - - """ - @brief Get the UDOps which are independent of svec in the current version - """ def _get_opr_indepent_svec(self): + """ + @brief Get the User Defined Operators independent of svec in the current version + """ rows = self._run_sql(""" SELECT oprname, @@ -141,16 +142,19 @@ def _load_config_param(self, config_iterable): make all function names lower case to ensure ease of comparison. Args: - @param config_dict is a dictionary with key as object name - (eg. function name) and value as the details for - the object. The details for the object are assumed to + @param config_iterable is an iterable of dictionaries, each with + key = object name (eg. function name) and value = details + for the object. The details for the object are assumed to be in a dictionary with following keys: rettype: Return type argument: List of arguments Returns: A dictionary that lists all specific objects (functions, aggregates, etc) - with object name as key and another dictionary with objects details + with object name as key and a list as value, where the list + contains all the items present in + + another dictionary with objects details as the value. 
""" _return_obj = defaultdict(list) @@ -162,70 +166,61 @@ def _load_config_param(self, config_iterable): if obj_details['argument'] is not None: argument = obj_details['argument'].lower().replace( 'schema_madlib', self._schema) + all_arguments = [each_arg.strip() + for each_arg in argument.split(',')] _return_obj[obj_name].append( - {'rettype': rettype, 'argument': argument}) + {'rettype': rettype, + 'argument': ','.join(all_arguments)}) return _return_obj - """ - @brief Load the configuration file - """ def _load(self): + """ + @brief Load the configuration file + """ # _mad_dbrev = 1.0 if float(self._mad_dbrev) < 1.1: - filename = os.path.join(self._maddir, 'madpack' , 'changelist_1.0_1.2.yaml') + filename = os.path.join(self._maddir, 'madpack', + 'changelist_1.0_1.3.yaml') # _mad_dbrev = 1.1 + elif float(self._mad_dbrev) < 1.2: + filename = os.path.join(self._maddir, 'madpack', + 'changelist_1.1_1.3.yaml') else: - filename = os.path.join(self._maddir, 'madpack' , 'changelist.yaml') - - config = yaml.load(open(filename)) - - if config['new module'] is not None: - self._newmodule = config['new module'] - else: - self._newmodule = {} + filename = os.path.join(self._maddir, 'madpack', + 'changelist.yaml') - if config['udt'] is not None: - self._udt = config['udt'] - else: - self._udt = {} - - if config['udc'] is not None: - self._udc = config['udc'] - else: - self._udc = {} + config = yaml.load(open(filename)) + self._newmodule = config['new module'] if config['new module'] else {} + self._udt = config['udt'] if config['udt'] else {} + self._udc = config['udc'] if config['udc'] else {} self._udf = self._load_config_param(config['udf']) self._uda = self._load_config_param(config['uda']) - """ - @brief Get the list of new modules - """ - def get_newmodule(self): + @property + def newmodule(self): return self._newmodule - """ - @brief Get the list of changed UDTs - """ - def get_udt(self): + @property + def udt(self): return self._udt - """ - @brief Get the list of changed UDAs - """ - def get_uda(self): + @property + def uda(self): return self._uda - """ - @brief Get the list of changed UDCs - @note This is a UDC in utilities module - """ - def get_udc(self): + @property + def udf(self): + return self._udf + + @property + def udc(self): return self._udc - """ - @brief Get the list of UDF signatures for comparison - """ def get_udf_signature(self): + """ + @brief Get the list of UDF signatures for comparison + """ res = defaultdict(bool) for udf in self._udf: for item in self._udf[udf]: @@ -234,10 +229,10 @@ def get_udf_signature(self): res[signature] = True return res - """ - @brief Get the list of UDA signatures for comparison - """ def get_uda_signature(self): + """ + @brief Get the list of UDA signatures for comparison + """ res = defaultdict(bool) for uda in self._uda: for item in self._uda[uda]: @@ -246,18 +241,17 @@ def get_uda_signature(self): res[signature] = True return res - """ - @brief Drop all types that were updated/removed in the new version - @note It is dangerous to drop a UDT becuase there might be many - dependencies - """ def drop_changed_udt(self): + """ + @brief Drop all types that were updated/removed in the new version + @note It is dangerous to drop a UDT becuase there might be many + dependencies + """ # Note that we use CASCADE option here. This might be dangerous because # it may drop some undetected dependent objects (eg. 
UDCast, UDOp, etc) for udt in self._udt: - self._run_sql(""" - DROP TYPE IF EXISTS {schema}.{udt} CASCADE - """.format(schema=self._schema, udt=udt)) + self._run_sql("DROP TYPE IF EXISTS {0}.{1} CASCADE". + format(self._schema, udt)) if udt == 'svec': # Drop operators defined in the svec module which do not # depend on svec. We will run the whole svec.sql without @@ -273,47 +267,45 @@ def drop_changed_udt(self): nsp_right=self._opr_ind_svec[opr]['nsp_right'], typ_right=self._opr_ind_svec[opr]['typ_right'] )) - """ - @brief Drop all functions (UDF) that were removed in new version - """ + def drop_changed_udf(self): + """ + @brief Drop all functions (UDF) that were removed in new version + """ for udf in self._udf: for item in self._udf[udf]: - self._run_sql(""" - DROP FUNCTION IF EXISTS {schema}.{udf}({arg}) - """.format(schema=self._schema, - udf=udf, - arg=item['argument'])) + self._run_sql("DROP FUNCTION IF EXISTS {schema}.{udf}({arg})". + format(schema=self._schema, + udf=udf, + arg=item['argument'])) - """ - @brief Drop all aggregates (UDA) that were removed in new version - """ def drop_changed_uda(self): + """ + @brief Drop all aggregates (UDA) that were removed in new version + """ for uda in self._uda: for item in self._uda[uda]: - self._run_sql(""" - DROP AGGREGATE IF EXISTS {schema}.{uda}({arg}) - """.format(schema=self._schema, - uda=uda, - arg=item['argument'])) + self._run_sql("DROP AGGREGATE IF EXISTS {schema}.{uda}({arg})". + format(schema=self._schema, + uda=uda, + arg=item['argument'])) - """ - @brief Drop all casts (UDC) that were updated/removed in new version - @note We have special treatment for UDCs defined in the svec module - """ def drop_changed_udc(self): + """ + @brief Drop all casts (UDC) that were updated/removed in new version + @note We have special treatment for UDCs defined in the svec module + """ for udc in self._udc: - self._run_sql(""" - DROP CAST IF EXISTS ({sourcetype} AS {targettype}) - """.format( - sourcetype=self._udc[udc]['sourcetype'], - targettype=self._udc[udc]['targettype'])) + self._run_sql("DROP CAST IF EXISTS ({sourcetype} AS {targettype})". 
+ format(sourcetype=self._udc[udc]['sourcetype'], + targettype=self._udc[udc]['targettype'])) + -""" -@brief This class detects the direct/recursive view dependencies on MADLib -UDFs/UDAs defined in the current version -""" class ViewDependency(UpgradeBase): + """ + @brief This class detects the direct/recursive view dependencies on MADLib + UDFs/UDAs defined in the current version + """ def __init__(self, schema, portid, con_args): UpgradeBase.__init__(self, schema, portid, con_args) self._view2proc = None @@ -358,7 +350,7 @@ def _detect_direct_view_dependency(self): self._view2proc = defaultdict(list) for row in rows: - key= (row['schema'], row['view']) + key = (row['schema'], row['view']) self._view2proc[key].append( (row['procname'], row['procoid'], True if row['proisagg'] == 't' else False)) @@ -443,7 +435,7 @@ def _filter_recursive_view_dependency(self): """ @brief Build the dependency graph (depender-to-dependee adjacency list) """ - def _build_dependency_graph(self, hasProcDependency = False): + def _build_dependency_graph(self, hasProcDependency=False): der2dee = self._view2view.copy() for view in self._view2proc: if view not in self._view2view: @@ -480,7 +472,7 @@ def get_create_order_views(self): del graph[view] for depender in graph: graph[depender] = [r for r in graph[depender] - if r not in remove_list] + if r not in remove_list] if len(remove_list) == 0: break return ordered_views @@ -496,53 +488,43 @@ def get_drop_order_views(self): """ @brief Get the depended UDF/UDA signatures for comparison """ - def get_depended_func_signature(self, isagg = True): + def get_depended_func_signature(self, aggregate=True): res = {} for procs in self._view2proc.values(): for proc in procs: - if proc[2] != isagg: - continue - if (self._schema, proc) not in res: + if proc[2] is aggregate and (self._schema, proc) not in res: funcinfo = self._get_function_info(proc[1]) - signature = get_signature_for_compare( - self._schema, proc[0], funcinfo['rettype'], funcinfo['argument']) + signature = get_signature_for_compare(self._schema, proc[0], + funcinfo['rettype'], + funcinfo['argument']) res[signature] = True return res - """ - @brief Get dependent UDAs - """ - def get_depended_uda(self): + def get_proc_w_dependency(self, aggregate=True): res = [] for procs in self._view2proc.values(): for proc in procs: - if proc[2] == False: - # proc is not an aggregate -> skip - continue - if (self._schema, proc) not in res: + if proc[2] is aggregate and (self._schema, proc) not in res: res.append((self._schema, proc)) res.sort() return res - """ - @brief Get dependent UDFs - """ + def get_depended_uda(self): + """ + @brief Get dependent UDAs + """ + self.get_proc_w_dependency(aggregate=True) + def get_depended_udf(self): - res = [] - for procs in self._view2proc.values(): - for proc in procs: - if proc[2] == True: - # proc is an aggregate -> skip - continue - if (self._schema, proc) not in res: - res.append((self._schema, proc)) - res.sort() - return res + """ + @brief Get dependent UDFs + """ + self.get_proc_w_dependency(aggregate=False) - """ - @brief Save and drop the dependent views - """ def save_and_drop(self): + """ + @brief Save and drop the dependent views + """ self._view2def = {} ordered_views = self.get_drop_order_views() # Save views @@ -564,10 +546,10 @@ def save_and_drop(self): DROP VIEW IF EXISTS {schema}.{view} """.format(schema=view[0], view=view[1])) - """ - @brief Restore the dependent views - """ def restore(self): + """ + @brief Restore the dependent views + """ ordered_views = 
self.get_create_order_views() for view in ordered_views: row = self._view2def[view] @@ -584,10 +566,10 @@ def restore(self): RESET ROLE """.format( schema=schema, view=view, - definition=definition, owner=owner)) + definition=definition, + owner=owner)) def _node_to_str(self, node): - res = '' if len(node) == 2: res = '%s.%s' % (node[0], node[1]) else: @@ -596,38 +578,36 @@ def _node_to_str(self, node): return res def _nodes_to_str(self, nodes): - res = [] - for node in nodes: - res.append(self._node_to_str(node)) - return res + return [self._node_to_str(i) for i in nodes] - """ - @brief Get the dependency graph string for print - """ def get_dependency_graph_str(self): + """ + @brief Get the dependency graph string for print + """ graph = self._build_dependency_graph(True) - nodes = graph.keys() + nodes = list(graph.keys()) nodes.sort() - res = '\t\tDependency Graph (Depender-Dependee Adjacency List):\n' + res = ["\tDependency Graph (Depender-Dependee Adjacency List):"] for node in nodes: - res += "\t\t%s -> %s\n" % ( - self._node_to_str(node), self._nodes_to_str(graph[node])) - return res[:-1] - -""" -@brief This class detects the table dependencies on MADLib UDTs defined in the -current version -""" + res.append("{0} -> {1}".format(self._node_to_str(node), + self._nodes_to_str(graph[node]))) + return "\n\t\t\t\t".join(res) + + class TableDependency(UpgradeBase): + """ + @brief This class detects the table dependencies on MADLib UDTs defined in the + current version + """ def __init__(self, schema, portid, con_args): UpgradeBase.__init__(self, schema, portid, con_args) self._table2type = None self._detect_table_dependency() - """ - @brief Detect the table dependencies on MADLib UDTs - """ def _detect_table_dependency(self): + """ + @brief Detect the table dependencies on MADLib UDTs + """ rows = self._run_sql(""" SELECT nsp.nspname AS schema, @@ -651,20 +631,20 @@ def _detect_table_dependency(self): self._table2type = defaultdict(list) for row in rows: - key= (row['schema'], row['relation']) + key = (row['schema'], row['relation']) self._table2type[key].append( (row['column'], row['type'])) - """ - @brief Check dependencies - """ def has_dependency(self): + """ + @brief Check dependencies + """ return len(self._table2type) > 0 - """ - @brief Get the list of depended UDTs - """ def get_depended_udt(self): + """ + @brief Get the list of depended UDTs + """ res = defaultdict(bool) for table in self._table2type: for (col, typ) in self._table2type[table]: @@ -672,34 +652,38 @@ def get_depended_udt(self): res[typ] = True return res - """ - @brief Get the dependencies in string for print - """ def get_dependency_str(self): - res = '\t\tTable Dependency (schema.table.column -> type):\n' + """ + @brief Get the dependencies in string for print + """ + res = ['\tTable Dependency (schema.table.column -> MADlib type):'] for table in self._table2type: for (col, udt) in self._table2type[table]: - res += "\t\t%s.%s.%s -> %s\n" % (table[0], table[1], col, udt) - return res[:-1] + res.append("{0}.{1}.{2} -> {3}".format(table[0], table[1], col, + udt)) + return "\n\t\t\t\t".join(res) + -""" -@brief This class removes sql statements from a sql script which should not be -executed during the upgrade -""" class ScriptCleaner(UpgradeBase): + """ + @brief This class removes sql statements from a sql script which should not be + executed during the upgrade + """ def __init__(self, schema, portid, con_args, change_handler): UpgradeBase.__init__(self, schema, portid, con_args) + self._ch = change_handler self._sql 
= None self._existing_uda = None self._existing_udt = None - self._get_existing_uda() + self._aggregate_patterns = self._get_all_aggregate_patterns() + # print("Number of existing UDAs = " + str(len(self._existing_uda))) + # print("Number of UDAs to not create = " + str(len(self._aggregate_patterns))) self._get_existing_udt() - self._ch = change_handler - """ - @breif Get the existing UDAs in the current version - """ def _get_existing_uda(self): + """ + @brief Get the existing UDAs in the current version + """ rows = self._run_sql(""" SELECT max(proname) AS proname, @@ -730,19 +714,47 @@ def _get_existing_uda(self): GROUP BY procoid """.format(schema=self._schema)) - self._existing_uda = {} + self._existing_uda = defaultdict(list) for row in rows: # Consider about the overloaded aggregates - if row['proname'] not in self._existing_uda: - self._existing_uda[row['proname']] = [] - self._existing_uda[row['proname']].append({ - 'rettype': ['rettype'], - 'argument': row['argument']}) + self._existing_uda[row['proname']].append( + {'rettype': row['rettype'], + 'argument': row['argument']}) + + def _get_all_aggregate_patterns(self): + """ + Creates a list of string patterns that represent all possible + 'CREATE AGGREGATE' statements except ones that are being + replaced/introduced as part of this upgrade. + + """ + self._get_existing_uda() + aggregate_patterns = [] + for each_uda, uda_details in self._existing_uda.iteritems(): + for each_item in uda_details: + if each_uda in self._ch.uda: + if each_item in self._ch.uda[each_uda]: + continue + p_arg_str = '' + argument = each_item['argument'] + args = argument.split(',') + for arg in args: + arg = self._rewrite_type_in(arg.strip()) + if p_arg_str == '': + p_arg_str += '%s\s*' % arg + else: + p_arg_str += ',\s*%s\s*' % arg + p_str = "CREATE\s+(ORDERED\s)*\s*AGGREGATE" \ + "\s+%s\.(%s)\s*\(\s*%s\)(.*?);" % (self._schema.upper(), + each_uda, + p_arg_str) + aggregate_patterns.append(p_str) + return aggregate_patterns - """ - @brief Get the existing UDTs in the current version - """ def _get_existing_udt(self): + """ + @brief Get the existing UDTs in the current version + """ rows = self._run_sql(""" SELECT typname @@ -753,21 +765,19 @@ def _get_existing_udt(self): t.typnamespace = nsp.oid AND nsp.nspname = '{schema}' """.format(schema=self._schema)) - self._existing_udt = [] - for row in rows: - self._existing_udt.append(row['typname']) + self._existing_udt = [row['typname'] for row in rows] - """ - @note The changer_handler is needed for deciding which sql statements to - remove - """ def get_change_handler(self): + """ + @note The changer_handler is needed for deciding which sql statements to + remove + """ return self._ch - """ - @brief Remove comments in the sql script - """ def _clean_comment(self): + """ + @brief Remove comments in the sql script + """ pattern = re.compile(r"""(/\*(.|[\r\n])*?\*/)|(--(.*|[\r\n]))""") res = '' lines = re.split(r'[\r\n]+', self._sql) @@ -784,20 +794,20 @@ def _clean_comment(self): """ def _clean_type(self): # remove 'drop type' - pattern = re.compile('DROP(\s+)TYPE(.*?);', re.DOTALL | re.IGNORECASE); + pattern = re.compile('DROP(\s+)TYPE(.*?);', re.DOTALL | re.IGNORECASE) self._sql = re.sub(pattern, '', self._sql) # remove 'create type' udt_str = '' for udt in self._existing_udt: - if udt in self._ch.get_udt(): + if udt in self._ch.udt: continue if udt_str == '': udt_str += udt else: udt_str += '|' + udt p_str = 'CREATE(\s+)TYPE(\s+)%s\.(%s)(.*?);' % (self._schema.upper(), udt_str) - pattern = re.compile(p_str, 
re.DOTALL | re.IGNORECASE); + pattern = re.compile(p_str, re.DOTALL | re.IGNORECASE) self._sql = re.sub(pattern, '', self._sql) """ @@ -805,22 +815,25 @@ def _clean_type(self): """ def _clean_cast(self): # remove 'drop cast' - pattern = re.compile('DROP(\s+)CAST(.*?);', re.DOTALL | re.IGNORECASE); + pattern = re.compile('DROP(\s+)CAST(.*?);', re.DOTALL | re.IGNORECASE) self._sql = re.sub(pattern, '', self._sql) # remove 'create cast' udc_str = '' - for udc in self._ch.get_udc(): + for udc in self._ch.udc: if udc_str == '': udc_str += '%s\s+AS\s+%s' % ( - self._ch.get_udc()[udc]['sourcetype'], self._ch.get_udc()[udc]['targettype']) + self._ch.udc[udc]['sourcetype'], + self._ch.udc[udc]['targettype']) else: udc_str += '|' + '%s\s+AS\s+%s' % ( - self._ch.get_udc()[udc]['sourcetype'], self._ch.get_udc()[udc]['targettype']) + self._ch.udc[udc]['sourcetype'], + self._ch.udc[udc]['targettype']) pattern = re.compile('CREATE\s+CAST(.*?);', re.DOTALL | re.IGNORECASE) if udc_str != '': - pattern = re.compile('CREATE\s+CAST\s*\(\s*(?!%s)(.*?);' % udc_str , re.DOTALL | re.IGNORECASE) + pattern = re.compile('CREATE\s+CAST\s*\(\s*(?!%s)(.*?);' % + udc_str, re.DOTALL | re.IGNORECASE) self._sql = re.sub(pattern, '', self._sql) """ @@ -828,11 +841,11 @@ def _clean_cast(self): """ def _clean_operator(self): # remove 'drop operator' - pattern = re.compile('DROP(\s+)OPERATOR(.*?);', re.DOTALL | re.IGNORECASE); + pattern = re.compile('DROP(\s+)OPERATOR(.*?);', re.DOTALL | re.IGNORECASE) self._sql = re.sub(pattern, '', self._sql) # remove 'create operator' - pattern = re.compile(r"""CREATE(\s+)OPERATOR(.*?);""", re.DOTALL | re.IGNORECASE); + pattern = re.compile(r"""CREATE(\s+)OPERATOR(.*?);""", re.DOTALL | re.IGNORECASE) self._sql = re.sub(pattern, '', self._sql) """ @@ -840,64 +853,45 @@ def _clean_operator(self): """ def _rewrite_type_in(self, arg): type_mapper = { - 'smallint':'(int2|smallint)', - 'integer':'(int|int4|integer)', - 'bigint':'(int8|bigint)', - 'double precision':'(float8|double precision)', - 'real':'(float4|real)', - 'character varying':'(varchar|character varying)' + 'smallint': '(int2|smallint)', + 'integer': '(int|int4|integer)', + 'bigint': '(int8|bigint)', + 'double precision': '(float8|double precision)', + 'real': '(float4|real)', + 'character varying': '(varchar|character varying)' } for typ in type_mapper: arg = arg.replace(typ, type_mapper[typ]) return arg.replace('[', '\[').replace(']', '\]') - """ - @brief Remove "drop/create aggregate" statements in the sql script - """ def _clean_aggregate(self): - # remove 'drop aggregate' - pattern = re.compile('DROP(\s+)AGGREGATE(.*?);', re.DOTALL | re.IGNORECASE); - self._sql = re.sub(pattern, '', self._sql) + # remove all drop aggregate statements + self._sql = re.sub(re.compile('DROP(\s+)AGGREGATE(.*?);', + re.DOTALL | re.IGNORECASE), + '', self._sql) + # remove all create aggregate statements except ones that should + # be created as part of upgrade + for each_pattern in self._aggregate_patterns: + regex_pat = re.compile(each_pattern, re.DOTALL | re.IGNORECASE) + self._sql = re.sub(regex_pat, '', self._sql) - # remove 'create aggregate' - uda_str = '' - for uda in self._existing_uda: - for item in self._existing_uda[uda]: - if uda in self._ch.get_uda(): - items = self._ch.get_uda()[uda] - if item in items: - continue - p_arg_str = '' - argument = item['argument'] - args = argument.split(',') - for arg in args: - arg = self._rewrite_type_in(arg.strip()) - if p_arg_str == '': - p_arg_str += '%s\s*' % arg - else: - p_arg_str += ',\s*%s\s*' % 
arg - p_str = 'CREATE\s+(ORDERED\s)*\s*AGGREGATE\s+%s\.(%s)\s*\(\s*%s\)(.*?);' % ( - self._schema.upper(), uda, p_arg_str) - pattern = re.compile(p_str, re.DOTALL | re.IGNORECASE); - self._sql = re.sub(pattern, '', self._sql) - - """ - @brief Remove "drop function" statements and rewrite "create function" - statements in the sql script - @note We don't drop any function - """ def _clean_function(self): + """ + @brief Remove "drop function" statements and rewrite "create function" + statements in the sql script + @note We don't drop any function + """ # remove 'drop function' - pattern = re.compile(r"""DROP(\s+)FUNCTION(.*?);""", re.DOTALL | re.IGNORECASE); + pattern = re.compile(r"""DROP(\s+)FUNCTION(.*?);""", re.DOTALL | re.IGNORECASE) self._sql = re.sub(pattern, '', self._sql) # replace 'create function' with 'create or replace function' - pattern = re.compile(r"""CREATE(\s+)FUNCTION""", re.DOTALL | re.IGNORECASE); + pattern = re.compile(r"""CREATE(\s+)FUNCTION""", re.DOTALL | re.IGNORECASE) self._sql = re.sub(pattern, 'CREATE OR REPLACE FUNCTION', self._sql) - """ - @brief Entry function for cleaning the sql script - """ def cleanup(self, sql): + """ + @brief Entry function for cleaning the sql script + """ self._sql = sql self._clean_comment() self._clean_type() diff --git a/src/ports/postgres/modules/regress/linear.py_in b/src/ports/postgres/modules/regress/linear.py_in index a524c653b..38aacaf25 100644 --- a/src/ports/postgres/modules/regress/linear.py_in +++ b/src/ports/postgres/modules/regress/linear.py_in @@ -9,7 +9,7 @@ from utilities.utilities import __unique_string from utilities.validate_args import table_exists from utilities.validate_args import columns_exist_in_table from utilities.validate_args import table_is_empty -from utilities.utilities import _assert +from utilities.utilities import _assert # ---------------------------------------------------------------------- def linregr_train(schema_madlib, source_table, out_table, @@ -20,17 +20,17 @@ def linregr_train(schema_madlib, source_table, out_table, """select setting from pg_settings where name='client_min_messages'""")[0]['setting'] plpy.execute("set client_min_messages to error") - + _validate_args(schema_madlib, source_table, out_table, dependent_varname, independent_varname, grouping_cols, heteroskedasticity_option) group_str = '' if grouping_cols is None else 'GROUP BY %s' % grouping_cols group_str_sel = '' if grouping_cols is None else grouping_cols + ',' - join_str = ',' if grouping_cols is None else 'JOIN' + join_str = ',' if grouping_cols is None else 'JOIN' using_str = '' if grouping_cols is None else 'USING (%s)' % grouping_cols - # Run linear regression + # Run linear regression temp_lin_rst = __unique_string() plpy.execute( """ @@ -46,12 +46,12 @@ def linregr_train(schema_madlib, source_table, out_table, {source_table} {group_str} """.format(schema_madlib=schema_madlib, - temp_lin_rst=temp_lin_rst, - group_str=group_str, - group_str_sel=group_str_sel, - dependent_varname=dependent_varname, - independent_varname=independent_varname, - source_table=source_table)) + temp_lin_rst=temp_lin_rst, + group_str=group_str, + group_str_sel=group_str_sel, + dependent_varname=dependent_varname, + independent_varname=independent_varname, + source_table=source_table)) # Run heteroskedasticity test if heteroskedasticity_option: @@ -65,18 +65,18 @@ def linregr_train(schema_madlib, source_table, out_table, {schema_madlib}.heteroskedasticity_test_linregr( {dependent_varname}, {independent_varname}, - (lin_rst).coef) AS hsk_rst 
+ (lin_rst).coef) AS hsk_rst FROM {source_table} {join_str} {temp_lin_rst} {using_str} {group_str} """.format(schema_madlib=schema_madlib, - temp_hsk_rst=temp_hsk_rst, - dependent_varname=dependent_varname, - independent_varname=independent_varname, - group_str_sel=group_str_sel, group_str=group_str, - join_str=join_str, using_str=using_str, - source_table=source_table, temp_lin_rst=temp_lin_rst)) - + temp_hsk_rst=temp_hsk_rst, + dependent_varname=dependent_varname, + independent_varname=independent_varname, + group_str_sel=group_str_sel, group_str=group_str, + join_str=join_str, using_str=using_str, + source_table=source_table, temp_lin_rst=temp_lin_rst)) + # Output the results join_str = '' using_str = '' @@ -105,7 +105,7 @@ def linregr_train(schema_madlib, source_table, out_table, CASE WHEN (lin.lin_rst).num_processed IS NULL THEN 0 ELSE (lin.lin_rst).num_processed - END AS num_rows_processed, + END AS num_rows_processed, CASE WHEN (lin.lin_rst).num_processed IS NULL THEN lin.num_rows ELSE lin.num_rows - (lin.lin_rst).num_processed @@ -127,14 +127,14 @@ def _validate_args(schema_madlib, source_table, out_table, dependent_varname, @brief validate the arguments """ _assert(source_table is not None and - source_table.strip().lower() not in ('null', ''), + source_table.strip().lower() not in ('null', ''), "Linregr error: Invalid data table name!") _assert(table_exists(source_table), "Linregr error: Data table does not exist!") _assert(not table_is_empty(source_table), "Linregr error: Data table is empty!") - _assert(out_table is not None and + _assert(out_table is not None and out_table.strip().lower() not in ('null', ''), "Linregr error: Invalid output table name!") _assert(not table_exists(out_table), @@ -159,3 +159,110 @@ def _validate_args(schema_madlib, source_table, out_table, dependent_varname, heteroskedasticity_option in (True, False), "Linregr error: Invalid heteroskedasticity_option") +# ------------------------------------------------------------------------------ +# -- Online help function ------------------------------------------------------ +# ------------------------------------------------------------------------------ + +def linregr_help_message(schema_madlib, message, **kwargs): + """ Help message for Linear Regression + + @brief + Args: + @param schema_madlib string, Name of the schema madlib + @param message string, Help message indicator + + Returns: + String. Contain the help message string + """ + if not message: + help_string = """ + ----------------------------------------------------------------------- + SUMMARY + ----------------------------------------------------------------------- + Ordinary Least Squares Regression, also called Linear Regression, is a + statistical model used to fit linear models. + + It models a linear relationship of a scalar dependent variable \f$ y \f$ to one + or more explanatory independent variables \f$ x \f$ to build a + model of coefficients. 
+
+        For more details on function usage:
+        SELECT {schema_madlib}.linregr_train('usage')
+
+        For an example on using the function:
+        SELECT {schema_madlib}.linregr_train('example')
+        """
+    elif message in ['usage', 'help', '?']:
+        help_string = """
+        -----------------------------------------------------------------------
+        USAGE
+        -----------------------------------------------------------------------
+        SELECT {schema_madlib}.linregr_train(
+            source_table,              -- name of input table
+            out_table,                 -- name of output table
+            dependent_varname,         -- name of dependent variable
+            independent_varname,       -- name of independent variable
+            grouping_cols,             -- names of columns to group-by
+            heteroskedasticity_option  -- perform heteroskedasticity test?
+        );
+
+        -----------------------------------------------------------------------
+        OUTPUT
+        -----------------------------------------------------------------------
+        The output table ('out_table' above) has the following columns:
+            <...>,                          -- Grouping columns used during training
+            'coef' DOUBLE PRECISION[],      -- Vector of coefficients
+            'r2' DOUBLE PRECISION,          -- R-squared coefficient
+            'std_err' DOUBLE PRECISION[],   -- Standard errors of coefficients
+            't_stats' DOUBLE PRECISION[],   -- t-stats of the coefficients
+            'p_values' DOUBLE PRECISION[],  -- p-values of the coefficients
+            'condition_no' INTEGER,         -- The condition number of the covariance matrix.
+            'bp_stats' DOUBLE PRECISION,    -- The Breusch-Pagan statistic of heteroskedasticity
+                                               (if heteroskedasticity_option=TRUE)
+            'bp_p_value' DOUBLE PRECISION   -- The Breusch-Pagan calculated p-value
+                                               (if heteroskedasticity_option=TRUE)
+        """
+    elif message in ['example', 'examples']:
+        help_string = """
+        CREATE TABLE houses (id INT, tax INT,
+                             bedroom INT, bath FLOAT,
+                             price INT, size INT, lot INT);
+        COPY houses FROM STDIN WITH DELIMITER '|';
+          1 |  590 | 2 |   1 |  50000 |  770 | 22100
+          2 | 1050 | 3 |   2 |  85000 | 1410 | 12000
+          3 |   20 | 3 |   1 |  22500 | 1060 |  3500
+          4 |  870 | 2 |   2 |  90000 | 1300 | 17500
+          5 | 1320 | 3 |   2 | 133000 | 1500 | 30000
+          6 | 1350 | 2 |   1 |  90500 |  820 | 25700
+          7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000
+          8 |  680 | 2 |   1 | 142500 | 1170 | 22000
+          9 | 1840 | 3 |   2 | 160000 | 1500 | 19000
+         10 | 3680 | 4 |   2 | 240000 | 2790 | 20000
+         11 | 1660 | 3 |   1 |  87000 | 1030 | 17500
+         12 | 1620 | 3 |   2 | 118600 | 1250 | 20000
+         13 | 3100 | 3 |   2 | 140000 | 1760 | 38000
+         14 | 2070 | 2 |   3 | 148000 | 1550 | 14000
+         15 |  650 | 3 | 1.5 |  65000 | 1450 | 12000
+        \.
+
+        -- Train a regression model. First, a single regression for all data.
+        SELECT {schema_madlib}.linregr_train( 'houses',
+                                              'houses_linregr',
+                                              'price',
+                                              'ARRAY[1, tax, bath, size]'
+                                            );
+        -- Generate three output models, one for each value of "bedroom".
+        SELECT {schema_madlib}.linregr_train( 'houses',
+                                              'houses_linregr_bedroom',
+                                              'price',
+                                              'ARRAY[1, tax, bath, size]',
+                                              'bedroom'
+                                            );
+        -- Examine the resulting models.
+        SELECT * FROM houses_linregr;
+        SELECT * FROM houses_linregr_bedroom;
+        """
+    else:
+        help_string = "No such option. Use {schema_madlib}.linregr_train()"
+
+    return help_string.format(schema_madlib=schema_madlib)
\ No newline at end of file
diff --git a/src/ports/postgres/modules/regress/linear.sql_in b/src/ports/postgres/modules/regress/linear.sql_in
index c1b7d93b9..5a5d98141 100644
--- a/src/ports/postgres/modules/regress/linear.sql_in
+++ b/src/ports/postgres/modules/regress/linear.sql_in
@@ -35,9 +35,10 @@ model of coefficients.
 
 @anchor train
 @par Training Function
+
 The linear regression training function has the following syntax.
-linregr_train( source_table, 
+linregr_train( source_table,
                out_table,
                dependent_varname,
                independent_varname,
@@ -45,13 +46,14 @@ linregr_train( source_table,
                heteroskedasticity_option
              )
 
+ \b Arguments
source_table
TEXT. The name of the table containing the training data.
out_table
-
TEXT. Name of the generated table containing the output model. +
TEXT. Name of the generated table containing the output model. The output table contains the following columns. @@ -116,6 +118,19 @@ linregr_train( source_table,
BOOLEAN, default: FALSE. When TRUE, the heteroskedasticity of the model is also calculated and returned with the results.
+@anchor warning +@warning The aggregate 'linregr' has been deprecated in favor of the function +'linregr_train'. If the aggregate 'linregr' is used to output the results of +linear regression to a table, it is recommended to follow the general pattern +shown below (replace text within '<...>' with the appropriate variable names). +
+CREATE TABLE \<out_table\> AS
+SELECT (r).*
+FROM (
+    SELECT linregr(\<dependent_varname\>, \<independent_varname\>) as r
+    FROM \<source_table\>
+    ) q;
+
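For applications that need to run this migration programmatically rather than from psql, a minimal sketch of the same pattern through psycopg2 (the connection settings are placeholders; table and column names are borrowed from the houses example later in this file):

    import psycopg2

    # Materialize the deprecated aggregate's composite result into a table,
    # mirroring the recommended CREATE TABLE ... SELECT (r).* pattern above.
    conn = psycopg2.connect(dbname="testdb")  # placeholder connection settings
    with conn:  # commits on success
        with conn.cursor() as cur:
            cur.execute("""
                CREATE TABLE houses_linregr AS
                SELECT (r).*
                FROM (SELECT madlib.linregr(price, ARRAY[1, tax, bath, size]) AS r
                      FROM houses) q
            """)
    conn.close()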
@anchor predict @@ -157,24 +172,24 @@ COPY houses FROM STDIN WITH DELIMITER '|'; 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 \\. --# Train a regression model. First, a single regression for all the data. -
 
+-# Train a regression model. First, a single regression for all the data.
+
 SELECT madlib.linregr_train( 'houses',
-                             'houses_linregr', 
-                             'price', 
+                             'houses_linregr',
+                             'price',
                              'ARRAY[1, tax, bath, size]'
                            );
-
+
-# Generate three output models, one for each value of "bedroom".
 SELECT madlib.linregr_train( 'houses',
-                             'houses_linregr_bedroom', 
-                             'price', 
+                             'houses_linregr_bedroom',
+                             'price',
                              'ARRAY[1, tax, bath, size]',
                              'bedroom'
                            );
 
--# Examine the resulting models. +-# Examine the resulting models.
 -- Set extended display on for easier reading of output
 \\x ON
@@ -210,7 +225,7 @@ coef         | {0.0112536020318378,41.4132554771633,0.0225072040636757,31.397549
 r2           | 1
 std_err      | {0,0,0,0}
 t_stats      | {Infinity,Infinity,Infinity,Infinity}
-p_values     | 
+p_values     |
 condition_no | Infinity
 -[ RECORD 3 ]+--------------------------------------------------------------------------
 bedroom      | 3
@@ -237,7 +252,7 @@ SELECT houses.*,
        madlib.linregr_predict( ARRAY[1,tax,bath,size],
                                m.coef
                              ) as predict,
-        price - 
+        price -
           madlib.linregr_predict( ARRAY[1,tax,bath,size],
                                   m.coef
                                 ) as residual
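For intuition about the query above: linregr_predict is the inner product of the fitted coefficient vector with the independent-variable array, and the residual is the observed price minus that prediction. A pure-Python sketch (the coefficient values are illustrative, not taken from the output above):

    def linregr_predict(coef, x):
        # Prediction = inner product of coefficients and features.
        assert len(coef) == len(x)
        return sum(c * xi for c, xi in zip(coef, x))

    coef = [27923.0, 53.9, -21648.0, 66.4]   # illustrative coefficients
    features = [1, 590, 1.0, 770]            # ARRAY[1, tax, bath, size], house 1
    print(linregr_predict(coef, features))   # residual = 50000 - this value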
@@ -374,7 +389,7 @@ http://en.wikipedia.org/wiki/Heteroscedasticity-consistent_standard_errors
 
 
 @anchor related
-@par Related Topics 
+@par Related Topics
 
 @ref grp_robust
 
@@ -391,6 +406,70 @@ File linear.sql_in, source file for the SQL functions
 @endinternal
 */
 
+---------------------------------------------------------------------------
+-- User facing functions
+---------------------------------------------------------------------------
+/**
+  * @brief Linear regression training function with grouping support.
+ **/
+CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
+    source_table                VARCHAR,    -- name of input  table
+    out_table                   VARCHAR,    -- name of output table
+    dependent_varname           VARCHAR,    -- name of dependent variable
+    independent_varname         VARCHAR,    -- name of independent variable
+    grouping_cols               VARCHAR,    -- names of columns to group-by
+    heteroskedasticity_option   BOOLEAN     -- do heteroskedasticity test or not
+) RETURNS VOID AS $$
+PythonFunction(regress, linear, linregr_train)
+$$ LANGUAGE plpythonu;
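PythonFunction(regress, linear, linregr_train) is an m4 macro expanded at build time. The generated plpythonu body has roughly the following shape (a simplified sketch, not the literal expansion; the library path is a placeholder):

    # plpythonu wraps this body in a function, so the bare 'return' and the
    # SQL argument names are valid here; schema_madlib is resolved by the
    # generated code from the schema the function is installed in.
    import sys
    sys.path.insert(1, "/usr/local/madlib/ports/postgres/modules")  # placeholder
    from regress import linear
    return linear.linregr_train(schema_madlib, source_table, out_table,
                                dependent_varname, independent_varname,
                                grouping_cols, heteroskedasticity_option)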
+
+CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
+    source_table                VARCHAR,    -- name of input  table
+    out_table                   VARCHAR,    -- name of output table
+    dependent_varname           VARCHAR,    -- name of dependent variable
+    independent_varname         VARCHAR,    -- name of independent variable
+    grouping_cols               VARCHAR     -- names of columns to group-by
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.linregr_train($1, $2, $3, $4, $5, FALSE);
+$$ LANGUAGE sql;
+
+CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
+    source_table                VARCHAR,    -- name of input  table
+    out_table                   VARCHAR,    -- name of output table
+    dependent_varname           VARCHAR,    -- name of dependent variable
+    independent_varname         VARCHAR     -- name of independent variable
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.linregr_train($1, $2, $3, $4, NULL, FALSE);
+$$ LANGUAGE sql;
+---------------------------------------------------------------------------
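The two shorter overloads above emulate optional arguments by forwarding fixed defaults to the six-argument form, likely because the oldest supported backends (e.g. Greenplum's PostgreSQL 8.2 base) lack parameter DEFAULTs. In Python terms the chain collapses to this sketch:

    # Omitted arguments fall back to grouping_cols=None and
    # heteroskedasticity_option=False, as in the SQL overloads.
    def linregr_train(source_table, out_table, dependent_varname,
                      independent_varname, grouping_cols=None,
                      heteroskedasticity_option=False):
        print("train %s -> %s" % (source_table, out_table))  # stand-in body

    linregr_train('houses', 'houses_linregr', 'price',
                  'ARRAY[1, tax, bath, size]')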
+
+-----------------------------------------------------------------------
+-- Online help function
+-----------------------------------------------------------------------
+
+----------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_train()
+RETURNS VARCHAR AS $$
+BEGIN
+    RETURN MADLIB_SCHEMA.linregr_train('');
+END;
+$$ LANGUAGE plpgsql VOLATILE;
+
+----------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linregr_train(
+    message VARCHAR     -- usage string
+)
+RETURNS VARCHAR AS $$
+PythonFunction(regress, linear, linregr_help_message)
+$$ LANGUAGE plpythonu VOLATILE;
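The zero- and one-argument forms surface the online help implemented in linear.py_in. A quick way to read it from Python (psycopg2; the connection settings and the 'madlib' schema name are assumptions):

    import psycopg2

    conn = psycopg2.connect(dbname="testdb")  # placeholder settings
    cur = conn.cursor()
    for msg in ("", "usage", "example"):
        cur.execute("SELECT madlib.linregr_train(%s)", (msg,))
        print(cur.fetchone()[0])  # summary, usage, then example help text
    conn.close()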
+
+----------------------------------------------------------------------
+
+
+-- Deprecated functions --------------------------------------------------------
+
 ---------------------------------------------------------------------------
 -- Result Types
 ---------------------------------------------------------------------------
@@ -556,42 +635,3 @@ CREATE AGGREGATE MADLIB_SCHEMA.heteroskedasticity_test_linregr(
     INITCOND=''
 );
 ---------------------------------------------------------------------------
-
-
----------------------------------------------------------------------------
--- User facing functions
----------------------------------------------------------------------------
-/**
-  * @brief Linear regression training function with grouping support.
- **/
-CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
-    source_table                VARCHAR,    -- name of input  table
-    out_table                   VARCHAR,    -- name of output table
-    dependent_varname           VARCHAR,    -- name of dependent variable
-    independent_varname         VARCHAR,    -- name of independent variable
-    grouping_cols               VARCHAR,    -- names of columns to group-by
-    heteroskedasticity_option   BOOLEAN     -- do heteroskedasticity test or not
-) RETURNS VOID AS $$
-PythonFunction(regress, linear, linregr_train)
-$$ LANGUAGE plpythonu;
-
-CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
-    source_table                VARCHAR,    -- name of input  table
-    out_table                   VARCHAR,    -- name of output table
-    dependent_varname           VARCHAR,    -- name of dependent variable
-    independent_varname         VARCHAR,    -- name of independent variable
-    grouping_cols               VARCHAR     -- names of columns to group-by
-) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.linregr_train($1, $2, $3, $4, $5, FALSE);
-$$ LANGUAGE sql;
-
-CREATE FUNCTION MADLIB_SCHEMA.linregr_train(
-    source_table                VARCHAR,    -- name of input  table
-    out_table                   VARCHAR,    -- name of output table
-    dependent_varname           VARCHAR,    -- name of dependent variable
-    independent_varname         VARCHAR     -- name of independent variable
-) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.linregr_train($1, $2, $3, $4, NULL, FALSE);
-$$ LANGUAGE sql;
----------------------------------------------------------------------------
-
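A note on why the deprecated CREATE AGGREGATE statements can remain in this file: during an upgrade, ScriptCleaner in upgrade_util.py strips every CREATE AGGREGATE whose signature already exists and is not listed in the changelist, so only changed objects are recreated. A self-contained sketch of that pattern matching (the schema is fixed to MADLIB and the aggregate body is abridged for the demo):

    import re

    # An unchanged aggregate, roughly as it appears in the new linear.sql_in.
    sql = """
    CREATE AGGREGATE MADLIB.heteroskedasticity_test_linregr(
        double precision, double precision[], double precision[]) (
        STYPE=MADLIB.bytea8,
        INITCOND=''
    );
    """

    # The kind of pattern _get_all_aggregate_patterns() builds: type synonyms
    # come from _rewrite_type_in(), brackets are escaped, match is case-blind.
    arg = r"(float8|double precision)"
    pat = re.compile(
        r"CREATE\s+(ORDERED\s)*\s*AGGREGATE\s+MADLIB\."
        r"(heteroskedasticity_test_linregr)\s*\(\s*"
        + arg + r"\s*,\s*" + arg + r"\[\]\s*,\s*" + arg + r"\[\]\s*\)(.*?);",
        re.DOTALL | re.IGNORECASE)

    print(re.sub(pat, '', sql).strip())  # prints '' -- the statement was removed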
diff --git a/src/ports/postgres/modules/regress/test/linear.sql_in b/src/ports/postgres/modules/regress/test/linear.sql_in
index 35bb37daa..68ae5d15a 100644
--- a/src/ports/postgres/modules/regress/test/linear.sql_in
+++ b/src/ports/postgres/modules/regress/test/linear.sql_in
@@ -51,6 +51,7 @@ SELECT assert(
 ) q;
 
 
+
 /*
  * The following example is taken from:
  * http://biocomp.health.unm.edu/biomed505/Course/Cheminformatics/advanced/data_classification_qsar/linear_multilinear_regression.pdf
@@ -155,10 +156,6 @@ SELECT assert(
     ) AS linregr
 ) ignored;
 
-
-
-
-
 ------------------------------------------------------------------------
 
 select linregr_train('houses', 'result_lin_houses', 'price',
@@ -194,3 +191,7 @@ from result_lin_houses
 where bedroom = 3;
 
 drop table if exists result_lin_houses cascade;
+
+select linregr_train();
+select linregr_train('usage');
+select linregr_train('example');
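Stepping back from the per-module diffs: the heart of the new multi-version support is the changelist dispatch in ChangeHandler._load(). It reduces to the following, which mirrors the code in this patch and runs standalone:

    def pick_changelist(mad_dbrev):
        # Choose the changelist spanning the installed version to v1.3,
        # as ChangeHandler._load() does above.
        if float(mad_dbrev) < 1.1:
            return 'changelist_1.0_1.3.yaml'
        elif float(mad_dbrev) < 1.2:
            return 'changelist_1.1_1.3.yaml'
        return 'changelist.yaml'  # v1.2 -> v1.3

    assert pick_changelist('1.0') == 'changelist_1.0_1.3.yaml'
    assert pick_changelist('1.1') == 'changelist_1.1_1.3.yaml'
    assert pick_changelist('1.2') == 'changelist.yaml'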
diff --git a/src/ports/postgres/modules/regress/test/logistic.sql_in b/src/ports/postgres/modules/regress/test/logistic.sql_in
index a4f1c81a7..77e3a8709 100644
--- a/src/ports/postgres/modules/regress/test/logistic.sql_in
+++ b/src/ports/postgres/modules/regress/test/logistic.sql_in
@@ -727,7 +727,7 @@ INSERT INTO all_null_patients(id, second_attack, treatment, trait_anxiety) VALUE
 (1, NULL, 0, 60);
 
 drop table if exists temp_result;
-select madlib.logregr_train(
+select logregr_train(
     'all_null_patients',
     'temp_result',
     'second_attack',