From 2a62e5ac7c9461f151efa7d9fc95227bc30ac4c2 Mon Sep 17 00:00:00 2001 From: mikeqfu Date: Tue, 15 Nov 2022 10:51:48 +0000 Subject: [PATCH] Fix bugs in the class `Stations` --- pyrcs/other_assets/station.py | 207 +++++++++++++++++++++++----------- 1 file changed, 144 insertions(+), 63 deletions(-) diff --git a/pyrcs/other_assets/station.py b/pyrcs/other_assets/station.py index 91c040c..b1f1555 100644 --- a/pyrcs/other_assets/station.py +++ b/pyrcs/other_assets/station.py @@ -67,13 +67,13 @@ def __init__(self, data_dir=None, update=False, verbose=True): print_conn_err(verbose=verbose) - self.catalogue = self._get_station_data_catalogue(update=update, verbose=False) + self.catalogue = self.get_catalogue(update=update, verbose=False) self.last_updated_date = get_last_updated_date(url=self.URL, parsed=True, as_date_type=False) self.data_dir, self.current_data_dir = init_data_dir(self, data_dir, category="other-assets") - def _get_station_data_catalogue(self, update=False, verbose=False): + def get_catalogue(self, update=False, verbose=False): """ Get catalogue of railway station data. @@ -90,7 +90,7 @@ def _get_station_data_catalogue(self, update=False, verbose=False): >>> stn = Stations() - >>> stn_data_cat = stn._get_station_data_catalogue() + >>> stn_data_cat = stn.get_catalogue() >>> type(stn_data_cat) dict >>> list(stn_data_cat.keys()) @@ -135,8 +135,7 @@ def _get_station_data_catalogue(self, update=False, verbose=False): hot_soup = { a.text: urllib.parse.urljoin(self.URL, a.get('href')) - for a in cold_soup.find_all('a') - } + for a in cold_soup.find_all('a')} catalogue = {} for k, v in hot_soup.items(): @@ -223,11 +222,11 @@ def check_row_spans(dat): temp_data = pd.concat(temp_dat, axis=1) temp_data = temp_data[dat.columns.to_list()] - dat_ = pd.concat([dat.drop(index=temp1.index), temp_data], axis=0, ignore_index=True) + dat = pd.concat([dat.drop(index=temp1.index), temp_data], axis=0, ignore_index=True) - dat_.sort_values(['Station'], ignore_index=True, inplace=True) + dat.sort_values(['Station'], ignore_index=True, inplace=True) - return dat_ + return dat @staticmethod def parse_coordinates_columns(dat): @@ -240,13 +239,12 @@ def parse_coordinates_columns(dat): :rtype: pandas.DataFrame """ - dat_ = dat.copy() ll_col_names = ['Degrees Longitude', 'Degrees Latitude'] - dat_.loc[:, ll_col_names] = dat_[ll_col_names].applymap( + dat[ll_col_names] = dat[ll_col_names].applymap( lambda x: None if x.strip() == '' else float(re.sub(r'(c\.)|≈', '', x))) - return dat_ + return dat @staticmethod def parse_station_column(dat): @@ -264,78 +262,111 @@ def parse_station_column(dat): x = 'Heathrow Junction [sometimes referred to as Heathrow Interchange]\t\t / [no CRS?]' """ - dat_ = dat.copy() - - temp1 = dat_['Station'].str.split('\t\t', expand=True) + temp1 = dat['Station'].str.split('\t\t', expand=True) temp1.columns = ['Station', 'CRS'] - dat_.loc[:, 'Station'] = temp1['Station'] + dat['Station'] = temp1['Station'] # Get notes for stations - stn_note_ = pd.Series('', index=dat_.index) + stn_note_ = pd.Series('', index=dat.index) for i, x in enumerate(temp1['Station']): if '[' in x and ']': y = re.search(r' \[(.*)]', x).group(0) # Station Note - dat_.loc[i, 'Station'] = x.replace(y, '') + dat.loc[i, 'Station'] = x.replace(y, '') stn_note_[i] = y.strip(' []') - dat_.insert(loc=dat_.columns.get_loc('Station') + 1, column='Station Note', value=stn_note_) + dat.insert(loc=dat.columns.get_loc('Station') + 1, column='Station Note', value=stn_note_) + + temp2 = temp1['CRS'].str.replace(' / /', ' &&& ').str.split(' / ', expand=True).fillna('') - temp2 = temp1['CRS'].str.split(' / ', expand=True).fillna('') if temp2.shape[1] == 1: temp2.columns = ['CRS'] temp2 = pd.concat([temp2, pd.DataFrame('', index=temp2.index, columns=['CRS Note'])], axis=1) else: temp2.columns = ['CRS', 'CRS Note'] - temp2.loc[:, 'CRS Note'] = temp2['CRS Note'].str.strip('[]') + temp2['CRS Note'] = temp2['CRS Note'].str.strip('[]') - temp2.loc[:, 'CRS'] = temp2['CRS'].str.replace(r'[()]', '', regex=True) + temp2['CRS'] = temp2['CRS'].str.replace(r'[()]', '', regex=True).map( + lambda z: ' and '.join(['{} [{}]'.format(*z_.split('✖')) for z_ in z.split(' &&& ')]) + if ' &&& ' in z else z) - dat_ = pd.concat([dat_, temp2], axis=1) + dat = pd.concat([dat, temp2], axis=1) - return dat_ + return dat @staticmethod - def parse_owner_and_operator_columns(dat): + def _parse_owner_and_operator(x, sep=' / '): """ - Parse ``'Owner'`` and ``'Operator'`` of the station locations data. - - :param dat: preprocessed data of the station locations - :type dat: pandas.DataFrame - :return: data with parsed information of owners and operators - :rtype: pandas.DataFrame + x = dat['Owner'][0] + x = dat['Owner'][1] """ - def _parse_owner_and_operator(x, sep=' / '): - """ - x = dat_['Owner'][0] - """ + if ' / and / ' in x: + y, y_ = x.replace(' / and / ', ' &&& '), '' + + elif ' / ' in x: x_ = x.split(sep) + # y - Owners or operators; y_ - Former owners or operators if len(x_) > 1: - owner_or_operator = x_[0] - if len(x_[1:]) > 1: - former_owners_or_operators = sep.join(x_[1:]) - else: - former_owners_or_operators = x_[1:] + y = x_[0] + y_ = x_[1] if len(x_[1:]) == 1 else sep.join(x_[1:]) else: - owner_or_operator, former_owners_or_operators = x_[0], '' + y, y_ = x_[0], '' - return owner_or_operator, former_owners_or_operators + else: + y, y_ = x, '' + + if '✖' in y and ' &&& ' in y: + y = ' and '.join(['{} [{}]'.format(*z.split('✖')) for z in y.split(' &&& ')]) - dat_ = dat.copy() + return y, y_ + + def parse_owner_and_operator_columns(self, dat): + """ + Parse ``'Owner'`` and ``'Operator'`` of the station locations data. + + :param dat: preprocessed data of the station locations + :type dat: pandas.DataFrame + :return: data with parsed information of owners and operators + :rtype: pandas.DataFrame + """ owner_operator = [] for col in ['Owner', 'Operator']: temp = pd.DataFrame( - dat_[col].map(_parse_owner_and_operator).to_list(), columns=[col, 'Former ' + col]) + dat[col].map(self._parse_owner_and_operator).to_list(), columns=[col, 'Former ' + col]) + del dat[col] + owner_operator.append(temp) - del dat_[col] + dat = pd.concat([dat] + owner_operator, axis=1) - owner_operator.append(temp) + return dat + + @staticmethod + def parse_elr_mileage_columns(dat): + """ + Parse ``'ELR'`` and ``'Mileage'`` of the station locations data. - dat_ = pd.concat([dat_] + owner_operator, axis=1) + :param dat: preprocessed data of the station locations + :type dat: pandas.DataFrame + :return: data with parsed ``'ELR'`` and ``'Mileage'`` + :rtype: pandas.DataFrame + """ - return dat_ + dat['Mileage'] = dat['Mileage'].map(lambda x: ']'.join(x.replace(' / (', ' [').rsplit(')', 1))) + + em_col_names = ['ELR', 'Mileage'] + dat[em_col_names] = dat[em_col_names].applymap( + lambda x: x.replace(' / / ', ' / ').replace(' / [', ' [').split(' / ') + if ' / ' in x else x) + + # Where the Mileage data indicates the start and end + idx = dat[dat[em_col_names].apply( + lambda x: len(x.Mileage) != len(x.ELR) and isinstance(x.Mileage, list), + axis=1)].index + dat.loc[idx, 'Mileage'] = dat.loc[idx, 'Mileage'].map(lambda x: ' - '.join(x)) + + return dat def collect_locations_by_initial(self, initial, update=False, verbose=False): """ @@ -369,13 +400,35 @@ def collect_locations_by_initial(self, initial, update=False, verbose=False): >>> type(stn_locations_a_codes) pandas.core.frame.DataFrame >>> stn_locations_a_codes.head() - Station ... Former Operator - 0 Abbey Wood ... London & South Eastern Railway from 1 April 20... - 1 Aber ... Keolis Amey Operations/Gweithrediadau Keolis A... - 2 Abercynon North ... [Cardiff Railway Company from 13 October 1996 ... - 3 Abercynon ... Keolis Amey Operations/Gweithrediadau Keolis A... - 4 Aberdare ... Keolis Amey Operations/Gweithrediadau Keolis A... + Station ... Former Operator + 0 Abbey Wood ... London & South Eastern Railway from 1 April 20... + 1 Abbey Wood ... London & South Eastern Railway from 1 April 20... + 2 Aber ... Keolis Amey Operations/Gweithrediadau Keolis A... + 3 Abercynon ... Keolis Amey Operations/Gweithrediadau Keolis A... + 4 Abercynon ... Keolis Amey Operations/Gweithrediadau Keolis A... [5 rows x 14 columns] + >>> stn_locations_a_codes.columns.to_list() + ['Station', + 'Station Note', + 'ELR', + 'Mileage', + 'Status', + 'Degrees Longitude', + 'Degrees Latitude', + 'Grid Reference', + 'CRS', + 'CRS Note', + 'Owner', + 'Former Owner', + 'Operator', + 'Former Operator'] + >>> stn_locations_a_codes[['Station', 'ELR', 'Mileage']].head() + Station ELR Mileage + 0 Abbey Wood NKL 11m 43ch + 1 Abbey Wood XRS 24.458km + 2 Aber CAR 8m 69ch + 3 Abercynon CAM 16m 28ch + 4 Abercynon ABD 16m 28ch """ beginning_with = validate_initial(initial) @@ -419,19 +472,25 @@ def collect_locations_by_initial(self, initial, update=False, verbose=False): ths = [re.sub(r'\n?\r+\n?', ' ', h.text).strip() for h in thead.find_all('th')] dat = parse_tr(trs=trs, ths=ths, as_dataframe=True) + dat_ = dat.copy() + parser_funcs = [ self.check_row_spans, self.parse_coordinates_columns, self.parse_station_column, self.parse_owner_and_operator_columns, + self.parse_elr_mileage_columns, ] for parser_func in parser_funcs: - dat = parser_func(dat) + dat_ = parser_func(dat_) - data[beginning_with] = dat + # Explode by ELR and Mileage + dat_ = dat_.explode(column=['ELR', 'Mileage'], ignore_index=True) - last_updated_date = get_last_updated_date(url=url, parsed=True) - data[self.KEY_TO_LAST_UPDATED_DATE] = last_updated_date + data = { + beginning_with: dat_.sort_values('Station', ignore_index=True), + self.KEY_TO_LAST_UPDATED_DATE: get_last_updated_date(url=url, parsed=True) + } if verbose == 2: print("Done.") @@ -479,13 +538,35 @@ def fetch_locations(self, update=False, dump_dir=None, verbose=False): >>> type(stn_location_codes_dat) pandas.core.frame.DataFrame >>> stn_location_codes_dat.head() - Station ... Former Operator - 0 Abbey Wood ... London & South Eastern Railway from 1 April 20... - 1 Aber ... Keolis Amey Operations/Gweithrediadau Keolis A... - 2 Abercynon ... Keolis Amey Operations/Gweithrediadau Keolis A... - 3 Abercynon North ... [Cardiff Railway Company from 13 October 1996 ... - 4 Aberdare ... Keolis Amey Operations/Gweithrediadau Keolis A... + Station ... Former Operator + 0 Abbey Wood ... London & South Eastern Railway from 1 April 20... + 1 Abbey Wood ... London & South Eastern Railway from 1 April 20... + 2 Aber ... Keolis Amey Operations/Gweithrediadau Keolis A... + 3 Abercynon ... Keolis Amey Operations/Gweithrediadau Keolis A... + 4 Abercynon ... Keolis Amey Operations/Gweithrediadau Keolis A... [5 rows x 14 columns] + >>> stn_location_codes_dat.columns.to_list() + ['Station', + 'Station Note', + 'ELR', + 'Mileage', + 'Status', + 'Degrees Longitude', + 'Degrees Latitude', + 'Grid Reference', + 'CRS', + 'CRS Note', + 'Owner', + 'Former Owner', + 'Operator', + 'Former Operator'] + >>> stn_location_codes_dat[['Station', 'ELR', 'Mileage']].head() + Station ELR Mileage + 0 Abbey Wood NKL 11m 43ch + 1 Abbey Wood XRS 24.458km + 2 Aber CAR 8m 69ch + 3 Abercynon CAM 16m 28ch + 4 Abercynon ABD 16m 28ch """ verbose_1 = collect_in_fetch_verbose(data_dir=dump_dir, verbose=verbose)