From 5b32cb2f2c6af1fc18ea0856a966d23c402f0cbd Mon Sep 17 00:00:00 2001 From: Alexander Dunkel Date: Fri, 11 Dec 2020 15:13:38 +0100 Subject: [PATCH] Update docs with more examples --- docs/input-mappings.md | 110 +++++++++++++++++++++++++++++++++++++ docs/input-types.md | 47 ++++++++++++++++ docs/output-mappings.md | 117 ++++++++++++++++++++++++++++++++++++++++ docs/package.md | 14 +++++ docs/use-cases.md | 22 ++++++++ 5 files changed, 310 insertions(+) create mode 100644 docs/input-mappings.md create mode 100644 docs/input-types.md create mode 100644 docs/output-mappings.md create mode 100644 docs/package.md create mode 100644 docs/use-cases.md diff --git a/docs/input-mappings.md b/docs/input-mappings.md new file mode 100644 index 0000000..466b8d0 --- /dev/null +++ b/docs/input-mappings.md @@ -0,0 +1,110 @@ +For any conversion, a mapping must exist. A mapping is defined in +a python file (`.py`) and describes how any input data is converted +to the [common lbsn structure](https://lbsn.vgiscience.org/), which +is available from the Python version of the Proto Buf Spec. + +Mappings are loaded dynamically. You can provide a path to a folder +containing mappings with the flag `--mappings_path ./subfolder`. + +If no path is provided, `lbsn raw` is assumed as input, for which +the file mapping is available in [lbsntransform/input/field_mapping_lbsn.py](/api/input/mappings/field_mapping_lbsn.html), +including lbsn db query syntax defined in [lbsntransform/input/db_query.py](/api/input/mappings/db_query.html). + +Predefined mappings exist for Flickr (CSV/JSON) and Twitter (JSON) +in the [resources folder](https://gitlab.vgiscience.de/lbsn/lbsntransform/resources). +If the git repository is cloned to a local folder, use +`--mappings_path ./resources/mappings/` to load Flickr or Twitter mappings. + +Input mappings must have some specific attributes to be recognized. + +Primarily, a class constant "MAPPING_ID" is used to load mappings, +e.g. 
the [field_mapping_lbsn.py](/api/input/mappings/field_mapping_lbsn.html)
+has the following module-level constant:
+```py
+MAPPING_ID = 0
+```
+
+**Examples:**
+
+To load data with the default mapping, use `lbsntransform --origin 0`.
+
+To load data from Twitter json, use
+```bash
+lbsntransform --origin 3 \
+    --mappings_path ./resources/mappings/ \
+    --file_input \
+    --file_type "json"
+```
+
+To load data from Flickr YFCC100M, use
+
+```bash
+lbsntransform --origin 21 \
+    --mappings_path ./resources/mappings/ \
+    --file_input \
+    --file_type "csv" \
+    --csv_delimiter $'\t'
+```
+
+# Custom Input Mappings
+
+Start with any of the predefined mappings, either from [field_mapping_lbsn.py](/api/input/mappings/field_mapping_lbsn.html),
+or [field_mapping_twitter.py](https://gitlab.vgiscience.de/lbsn/lbsntransform/resources/field_mapping_twitter.py) (JSON) and
+[field_mapping_yfcc100m.py](https://gitlab.vgiscience.de/lbsn/lbsntransform/resources/field_mapping_yfcc100m.py) (CSV).
+
+A minimal template looks as follows:
+
+```py
+# -*- coding: utf-8 -*-
+
+"""
+Module for mapping example Posts dataset to common LBSN Structure.
+""" + +from typing import Optional +from lbsnstructure import lbsnstructure_pb2 as lbsn +from lbsntransform.tools.helper_functions import HelperFunctions as HF + +MAPPING_ID = 99 + +class importer(): + """ Provides mapping function from Example Post Source to + protobuf lbsnstructure + """ + ORIGIN_NAME = "Example Post Source" + ORIGIN_ID = 2 + + def __init__(self, + disable_reaction_post_referencing=False, + geocodes=False, + map_full_relations=False, + map_reactions=True, + ignore_non_geotagged=False, + ignore_sources_set=None, + min_geoaccuracy=None): + origin = lbsn.Origin() + origin.origin_id = 99 + self.origin = origin + self.null_island = 0 + self.skipped_count = 0 + self.skipped_low_geoaccuracy = 0 + + def parse_csv_record(self, record, record_type: Optional[str] = None): + """Entry point for processing CSV data: + Attributes: + record A single row from CSV, stored as list type. + """ + # extract/convert all lbsn records + lbsn_records = self.extract_post(record) + return lbsn_records + + def extract_post(self, record): + post_record = HF.new_lbsn_record_with_id( + lbsn.Post(), post_guid, self.origin) +``` + +!!! Note + For one lbsn origin, many mappings may exist. For example, + for the above example origin with id "99", you may have + mappings with ids 991, 992, 993 etc. This can be used to + create separate mappings for json, csv etc. \ No newline at end of file diff --git a/docs/input-types.md b/docs/input-types.md new file mode 100644 index 0000000..53830d6 --- /dev/null +++ b/docs/input-types.md @@ -0,0 +1,47 @@ +# Input type: file, url, or database? + +lbsntransform can read data from different common types of data sources: + +The following cli arguments are available: + +* file input `--file_input` + * json files `--file_type json` + * stacked `--is_stacked_json` + The typical form for json is `[{json1},{json2}]`. If `--is_stacked_json` is set, + jsons in the form of `{json1}{json2}` (no comma) can be imported. 
+        * line separated `--is_line_separated_json`
+          If this flag is used, lbsntransform expects one json per line (separated with a line break).
+    * csv files `--file_type csv`
+        * Set CSV delimiter with `--csv_delimiter`, common types are e.g.:
+            * Comma: `','` (default)
+            * Semi-colon: `';'`
+            * Tab: `$'\t'`
+    * Additional flags for file input:
+        * `--input_path_url` the folder, path or url to read from, e.g.:
+            * `--input_path_url 01_Input` Read from the relative subfolder "01_Input" (default).
+            * `--input_path_url ~/data/` Read from the user's home folder "data".
+            * `--input_path_url /c/tmp/data` Read from a WSL mounted subdir from Windows.
+            * "/d/03_EvaVGI/01_Daten/02_FlickrCommons/Flickr_Commons_100Million_YFCC100M_dataset/"
+        * `--recursive_load` to recursively process local sub directories (default depth: 2).
+        * `--skip_until_file x` to process all files until a file with name `x` is found.
+        * `--zip_records` Allows to zip records from multiple sources using semi-colon (`;`), e.g.:
+            * `--input_path_url "https://mypage.org/dataset_col1.csv;https://mypage.org/dataset_col2.csv"`
+              Will process records from both csv files in parallel, by zipping files.
+* data base input (Postgres)
+    * `--dbuser_input "postgres"` the name of the dbuser
+    * `--dbserveraddress_input "127.0.0.1:5432"` the name and (optional) the port to use. The default postgres port is `5432`.
+    * `--dbname_input "rawdb"` the name of the database.
+    * `--dbpassword_input "mypw"` the password to use when connecting.
+    * `--dbformat_input "lbsn"` the format of the database. Currently, only "lbsn" and "json" are supported.
+    * Additional flags for db input:
+      - `--records_tofetch 1000` If retrieving from a db, limit the
+        number of records to fetch per batch. Defaults to 10k.
+      - `--startwith_db_rownumber xyz` To resume processing from an arbitrary ID.
+        If input db type is "LBSN", provide the primary key to start from (e.g. post_guid, place_guid etc.).
+        This flag will only work if processing a single lbsnObject (e.g. lbsnPost).
+      - `--endwith_db_rownumber xyz` To stop processing at a particular row-id.
+      - `--include_lbsn_objects` If processing from lbsn rawdb, provide a comma separated list of
+        [lbsn objects](https://lbsn.vgiscience.org/structure/) to include. May contain:
+        origin,country,city,place,user_groups,user,post,post_reaction,event
+        Excluded objects will not be queried, but empty objects may be created due to referenced
+        foreign key relationships. Defaults to origin,post.
\ No newline at end of file
diff --git a/docs/output-mappings.md b/docs/output-mappings.md
new file mode 100644
index 0000000..63b8917
--- /dev/null
+++ b/docs/output-mappings.md
@@ -0,0 +1,117 @@
+**lbsntransform** can output data to a database with the [common lbsn structure](https://lbsn.vgiscience.org/),
+called [rawdb](https://gitlab.vgiscience.de/lbsn/databases/rawdb)
+or the privacy-aware version, called [hlldb](https://gitlab.vgiscience.de/lbsn/databases/hlldb).
+
+**Examples:**
+
+To output data to rawdb:
+
+```bash
+lbsntransform --dbpassword_output "sample-key" \
+    --dbuser_output "postgres" \
+    --dbserveraddress_output "127.0.0.1:5432" \
+    --dbname_output "rawdb" \
+    --dbformat_output "lbsn"
+```
+
+The syntax for conversion to hlldb is a little bit more complex,
+since the output structure may vary to a large degree, depending
+on each use case.
+
+!!! note
+    The hlldb and its structure are still in an early stage of development.
+    We're beyond the initial proof of concept and are working on
+    simplifying custom mappings.
+
+To output data to hlldb:
+```bash
+lbsntransform --dbpassword_output "sample-key" \
+    --dbuser_output "postgres" \
+    --dbserveraddress_output "127.0.0.1:25432" \
+    --dbname_output "hlldb" \
+    --dbformat_output "hll" \
+    --dbpassword_hllworker "sample-key" \
+    --dbuser_hllworker "postgres" \
+    --dbserveraddress_hllworker "127.0.0.1:15432" \
+    --dbname_hllworker "hllworkerdb" \
+    --include_lbsn_objects "origin,post"
+```
+
+Above, a separate connection to a "hll_worker" database is provided.
+It is used to make hll calculations (union, hashing etc.). No items
+will be written to this database, a read_only user will suffice. A
+[Docker container with a predefined user](https://gitlab.vgiscience.de/lbsn/databases/pg-hll-empty)
+is available.
+
+Having two hll databases, one for calculations and one for storage means
+that concerns can be separated: There is no need for hlldb to receive any
+raw data. Likewise, the hll worker does not need to know contextual data,
+for union of specific hll sets. Such a setup improves robustness and privacy.
+It further allows to separate processing into individual components.
+
+If no hll worker is available, hlldb may be used.
+
+Use `--include_lbsn_objects` to specify which input data you want to convert to
+the privacy aware version. For example, `--include_lbsn_objects "origin,post"`
+would process [lbsn objects](https://lbsn.vgiscience.org/structure/)
+of type origin and post (default).
+
+Use `--include_lbsn_bases` to specify which output data you want to convert to.
+
+We call this "bases", and they are defined in output mappings in
+[lbsntransform/output/hll/hll_bases.py](/api/output/hll/hll_bases.html).
+
+Bases can be separated by comma and may include:
+
+- Temporal Facet:
+    - `monthofyear`
+    - `month`
+    - `dayofmonth`
+    - `dayofweek`
+    - `hourofday`
+    - `year`
+    - `month`
+    - `date`
+    - `timestamp`
+
+- Spatial Facet:
+    - `country`
+    - `region`
+    - `city`
+    - `place`
+    - `latlng`
+
+- Social Facet:
+    - `community`
+
+- Topical Facet:
+    - `hashtag`
+    - `emoji`
+    - `term`
+
+- Composite Bases:
+    - `_hashtag_latlng`
+    - `_term_latlng`
+    - `_emoji_latlng`
+
+
+For example:
+```bash
+lbsntransform --include_lbsn_bases hashtag,place,date,community
+```
+
+would fill/update entries of the hlldb structures:
+- topical.hashtag
+- spatial.place
+- temporal.date
+- social.community
+
+This name refers to `schema.table`.
+
+It is possible to define own output hll db mappings. The best place
+to start is [lbsntransform/output/hll/hll_bases.py](/api/output/hll/hll_bases.html).
+Have a look at the pre-defined bases and add any additional bases needed. It is recommended
+to use inheritance. After adding your own mappings, the hlldb must be prepared with
+respective table structures. Have a look at the
+[predefined structures available](https://gitlab.vgiscience.de/lbsn/structure/hlldb).
+
diff --git a/docs/package.md b/docs/package.md
new file mode 100644
index 0000000..15053d8
--- /dev/null
+++ b/docs/package.md
@@ -0,0 +1,14 @@
+For in-memory conversion, it is possible to import lbsntransform as a package:
+
+```py
+import lbsntransform as lt
+lt.add_processed_records(
+    record)
+lt.store_lbsn_records()
+```
+
+As a starting point, have a look at
+[lbsntransform/__main__.py](https://gitlab.vgiscience.de/lbsn/lbsntransform/-/blob/master/lbsntransform/__main__.py),
+which includes the code that is invoked on command line use.
+
+We plan to update this section with a Jupyter Lab example notebook.
\ No newline at end of file
diff --git a/docs/use-cases.md b/docs/use-cases.md
new file mode 100644
index 0000000..c69f930
--- /dev/null
+++ b/docs/use-cases.md
@@ -0,0 +1,22 @@
+If you're using the command line interface, a common usage of lbsntransform is to
+import/convert arbitrary social media data, e.g. from Flickr or Twitter, to a Postgres Database
+with the [common lbsn structure](https://lbsn.vgiscience.org/).
+
+The following use cases exist:
+
+1. importing lbsntransform as a package
+
+    Use this approach to convert data, such as individual posts
+    retrieved from an API, on-the-fly (in-memory), in your own
+    python package.
+
+2. using the command line interface (cli) to perform batch conversions
+
+    Use this approach if you want to convert batches of data stored as
+    arbitrary json/csv files, or if you want to convert from a database
+    with the raw lbsn structure to a database with the privacy-aware hll
+    format.
+
+For any conversion,
+- the input type must be provided, see [input-types](input-types).
+- a mapping must exist, see [input-mappings](input-mappings).