From 5705090610477f04dd3bb208776cedd969e25db3 Mon Sep 17 00:00:00 2001 From: Greg Wilson Date: Thu, 27 Feb 2014 10:15:46 -0500 Subject: [PATCH] * Getting rid of old cheat sheets * Indenting error output cells * Removing explicit 'level' keys from Markdown files * Storing the generated files so that people who don't use those tools won't have to regenerate them. * Modifying .gitignore to reflect this. * Updating the Makefile to run Jekyll exactly once. * Command to install on the server * No longer worrying about making a page of images * Better (more guessable) name for the target that builds the website * Fixing image paths * Fixing up glossary entries --- .gitignore | 25 +- Makefile | 247 +++----- _includes/cheat-sheets/sql.md | 132 ----- _layouts/book.html | 52 ++ _templates/book.tpl | 61 -- bib.md | 7 +- bin/make-book.py | 64 +- bin/make-image-page.py | 42 -- cached/novice/python/01-numpy.md | 709 ++++++++++++++++++++++ cached/novice/python/02-func.md | 850 +++++++++++++++++++++++++++ cached/novice/python/03-loop.md | 423 +++++++++++++ cached/novice/python/04-cond.md | 698 ++++++++++++++++++++++ cached/novice/python/05-defensive.md | 420 +++++++++++++ cached/novice/python/06-cmdline.md | 660 +++++++++++++++++++++ cached/novice/sql/01-select.md | 532 +++++++++++++++++ cached/novice/sql/02-sort-dup.md | 251 ++++++++ cached/novice/sql/03-filter.md | 274 +++++++++ cached/novice/sql/04-calc.md | 272 +++++++++ cached/novice/sql/05-null.md | 223 +++++++ cached/novice/sql/06-agg.md | 371 ++++++++++++ cached/novice/sql/07-join.md | 288 +++++++++ cached/novice/sql/08-create.md | 102 ++++ cached/novice/sql/09-prog.md | 136 +++++ css/lesson.css | 1 + gloss.md | 6 +- novice/extras/01-branching.md | 1 - novice/extras/02-review.md | 1 - novice/extras/03-permissions.md | 1 - novice/extras/04-shellvar.md | 1 - novice/extras/05-ssh.md | 1 - novice/extras/06-alias.md | 1 - novice/extras/07-exceptions.md | 3 +- novice/extras/08-unit.md | 1 - novice/extras/09-debugger.md | 1 - 
novice/extras/10-numbers.md | 1 - novice/extras/11-human.md | 1 - novice/extras/12-why.md | 1 - novice/extras/fixme-man.md | 1 - novice/extras/index.md | 1 - novice/git/00-intro.md | 1 - novice/git/01-backup.md | 1 - novice/git/02-collab.md | 1 - novice/git/03-conflict.md | 1 - novice/git/04-open.md | 1 - novice/git/index.md | 1 - novice/python/index.md | 1 - novice/r/README.md | 1 - novice/ref/01-shell.md | 1 - novice/ref/02-git.md | 1 - novice/ref/03-python.md | 1 - novice/ref/04-sql.md | 1 - novice/ref/index.md | 1 - novice/shell/00-intro.md | 1 - novice/shell/01-filedir.md | 1 - novice/shell/02-create.md | 1 - novice/shell/03-pipefilter.md | 1 - novice/shell/04-loop.md | 1 - novice/shell/05-script.md | 1 - novice/shell/06-find.md | 1 - novice/shell/index.md | 1 - novice/sql/index.md | 1 - novice/teaching/01-shell.md | 1 - novice/teaching/02-git.md | 1 - novice/teaching/03-python.md | 1 - novice/teaching/04-sql.md | 1 - novice/teaching/index.md | 1 - 66 files changed, 6410 insertions(+), 479 deletions(-) delete mode 100644 _includes/cheat-sheets/sql.md create mode 100644 _layouts/book.html delete mode 100644 _templates/book.tpl delete mode 100755 bin/make-image-page.py create mode 100644 cached/novice/python/01-numpy.md create mode 100644 cached/novice/python/02-func.md create mode 100644 cached/novice/python/03-loop.md create mode 100644 cached/novice/python/04-cond.md create mode 100644 cached/novice/python/05-defensive.md create mode 100644 cached/novice/python/06-cmdline.md create mode 100644 cached/novice/sql/01-select.md create mode 100644 cached/novice/sql/02-sort-dup.md create mode 100644 cached/novice/sql/03-filter.md create mode 100644 cached/novice/sql/04-calc.md create mode 100644 cached/novice/sql/05-null.md create mode 100644 cached/novice/sql/06-agg.md create mode 100644 cached/novice/sql/07-join.md create mode 100644 cached/novice/sql/08-create.md create mode 100644 cached/novice/sql/09-prog.md diff --git a/.gitignore b/.gitignore index 
e3d805bee..3a9cff4b8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,26 +1,7 @@ -*~ *.pyc +*~ .DS_Store .ipynb_checkpoints _site -tmp -image-page.html -tmp-book.* -patched-gloss.md -*/*/*_files -lessons/misc-r/full-R-bootcamp/R-basics/01-basics-of-R.md -lessons/misc-r/full-R-bootcamp/R-basics/rstudio-basics.md -lessons/misc-r/full-R-bootcamp/R-basics/02-data-structures.md -lessons/misc-r/full-R-bootcamp/R-basics/03-best-practices.md -lessons/misc-r/full-R-bootcamp/R-basics/04-seeking-help.md -lessons/misc-r/full-R-bootcamp/R-basics/05-subsetting.md -lessons/misc-r/full-R-bootcamp/R-basics/06-vectorization.md -lessons/misc-r/full-R-bootcamp/R-basics/best-practices.md -lessons/misc-r/full-R-bootcamp/functions/01-functions.md -lessons/misc-r/full-R-bootcamp/functions/02-control_structures.md -lessons/misc-r/full-R-bootcamp/functions/03-scoping_rules.md -lessons/misc-r/full-R-bootcamp/data-manipulation/00-messy_data.md -lessons/misc-r/full-R-bootcamp/data-manipulation/01-input-output.md -lessons/misc-r/full-R-bootcamp/data-manipulation/02-apply-family.md -lessons/misc-r/full-R-bootcamp/data-manipulation/03-split-apply.md -lessons/misc-r/full-R-bootcamp/testing-documentation/testing.md +book.md +cached/novice/*/*_files diff --git a/Makefile b/Makefile index 9922e5798..08e3b3fb9 100644 --- a/Makefile +++ b/Makefile @@ -1,191 +1,122 @@ -#----------------------------------------------------------- +#====================================================================== # Re-make lecture materials. -#----------------------------------------------------------- +#====================================================================== # Directories. -OUT = _site -LINK_OUT = /tmp/bc-links -BOOK = _book -INSTALL_DIR = $(HOME)/sites/software-carpentry.org/v5 - -# Source Markdown pages. 
-MARKDOWN_SRC = \ - LICENSE.md \ - contents.md \ - bib.md \ - gloss.md \ - rules.md \ - setup.md \ - team.md \ - intro.md \ - $(sort $(wildcard novice/shell/*.md)) \ - $(sort $(wildcard novice/git/*.md)) \ - $(sort $(wildcard novice/python/*.md)) \ - $(sort $(wildcard novice/sql/*.md)) \ - $(sort $(wildcard novice/extras/*.md)) \ - $(sort $(wildcard novice/teaching/*.md)) \ - $(sort $(wildcard novice/ref/*.md)) - -NOTEBOOK_SRC = \ - $(sort $(wildcard novice/python/??-*.ipynb)) \ - $(sort $(wildcard novice/sql/??-*.ipynb)) - -# Slides. -SLIDES_SRC = \ - $(sort (wildcard slides/*.html)) -SLIDES_DST = \ - $(patsubst %.html,$(OUT)/%.html,$(SLIDES_SRC)) - -NOTEBOOK_MD = \ - $(patsubst %.ipynb,%.md,$(NOTEBOOK_SRC)) - -HTML_DST = \ - $(patsubst %.md,$(OUT)/%.html,$(MARKDOWN_SRC)) \ - $(patsubst %.md,$(OUT)/%.html,$(NOTEBOOK_MD)) - -# Source for book (in order, with some substutitions). -BOOK_SRC = \ - intro.md \ - team.md \ - novice/shell/index.md $(sort $(wildcard novice/shell/??-*.md)) \ - novice/git/index.md $(sort $(wildcard novice/git/??-*.md)) \ - novice/python/index.md $(patsubst %.ipynb,%.md,$(sort $(wildcard novice/python/??-*.ipynb))) \ - novice/sql/index.md $(patsubst %.ipynb,%.md,$(sort $(wildcard novice/sql/??-*.ipynb))) \ - novice/extras/index.md $(sort $(wildcard novice/extras/??-*.md)) \ - novice/teaching/index.md $(sort $(wildcard novice/teaching/??-*.md)) \ - novice/ref/index.md $(sort $(wildcard novice/ref/??-*.md)) \ - bib.md \ - tmp/gloss.md \ - rules.md \ - LICENSE.md +SITE = _site +INSTALL = $(HOME)/sites/software-carpentry.org/v5 +LINKS = /tmp/bc-links +CACHED = cached -BOOK_TMP = \ - $(patsubst %,tmp/%,$(BOOK_SRC)) +# Templates for nbconvert and Pandoc. +IPYNB_TPL = _templates/ipynb.tpl -BOOK_DST = $(OUT)/book.html +# Temporary book file. +BOOK_MD = ./book.md -# Mark cached versions of compiled notebooks as SECONDARY so that GNU -# Make won't delete them after rebuilding. -.SECONDARY : $(NOTEBOOK_MD) +# Principal target files. 
+INDEX = $(SITE)/index.html -#----------------------------------------------------------- - -# Default action: show available commands (marked with double '#'). -all : commands - -## quick : build just the bootcamp home page. -quick : $(OUT)/index.html - jekyll -t build -d $(OUT) - -## install : install on the server. -install : $(OUT)/index.html - rm -rf $(INSTALL_DIR) - cp -r _site $(INSTALL_DIR) - mv $(INSTALL_DIR)/contents.html $(INSTALL_DIR)/index.html +# Directives. +.INTERMEDIATE : $(BOOK_MD) -## site : build site. -site : $(BOOK_DST) +#---------------------------------------------------------------------- +# Specify the default target before any other targets are defined so +# that we're sure which one Make will choose. +#---------------------------------------------------------------------- -$(BOOK_DST) : $(OUT)/index.html $(BOOK_TMP) _templates/book.tpl tmp/gloss.md bin/make-book.py - python bin/make-book.py $(BOOK_TMP) \ - | pandoc --email-obfuscation=none --template=_templates/book.tpl -t html -o - \ - | sed -e 's!../../gloss.html#!#g:!g' \ - | sed -e 's!../gloss.html#!#g:!g' \ - > $@ - -# Build HTML versions of Markdown source files using Jekyll. -$(OUT)/index.html : $(MARKDOWN_SRC) $(NOTEBOOK_MD) - jekyll -t build -d $(OUT) - sed -i -e 's!img src="novice/python/!img src="!g' $(OUT)/novice/python/??-*.html - -index.html setup.md : _includes/setup.html - -# Build Markdown versions of IPython Notebooks. -%.md : %.ipynb _templates/ipynb.tpl - ipython nbconvert --template=_templates/ipynb.tpl --to=markdown --output="$(subst .md,,$@)" "$<" - -# Patch targets and links in the glossary for inclusion in the book. -tmp/gloss.md : gloss.md - @mkdir -p $$(dirname $@) - sed -e 's!](#!](#g:!g' -e 's! $@ +all : commands -# Patch image paths in the sections. -tmp/novice/shell/%.md : novice/shell/%.md - @mkdir -p $$(dirname $@) - sed -e 's! 
$@ +#---------------------------------------------------------------------- +# Create Markdown versions of IPython Notebooks in CACHED directory. +#---------------------------------------------------------------------- -tmp/novice/git/%.md : novice/git/%.md - @mkdir -p $$(dirname $@) - sed -e 's! $@ +# IPython Notebooks (split by directory so that they can be +# interpolated into other variables later on). +IPYNB_SRC_PYTHON = $(sort $(wildcard novice/python/??-*.ipynb)) +IPYNB_SRC_SQL = $(sort $(wildcard novice/sql/??-*.ipynb)) -tmp/novice/python/%.md : novice/python/%.md - @mkdir -p $$(dirname $@) - sed -e 's! $@ +# Notebooks converted to Markdown. +IPYNB_TX_PYTHON = $(patsubst %.ipynb,$(CACHED)/%.md,$(IPYNB_SRC_PYTHON)) +IPYNB_TX_SQL = $(patsubst %.ipynb,$(CACHED)/%.md,$(IPYNB_SRC_SQL)) -tmp/novice/sql/%.md : novice/sql/%.md - @mkdir -p $$(dirname $@) - sed -e 's! $@ +# Convert a .ipynb to .md. +$(CACHED)/%.md : %.ipynb $(IPYNB_TPL) + ipython nbconvert --template=$(IPYNB_TPL) --to=markdown --output="$(subst .md,,$@)" "$<" -tmp/novice/extras/%.md : novice/extras/%.md - @mkdir -p $$(dirname $@) - sed -e 's! $@ +#---------------------------------------------------------------------- +# Build everything with Jekyll. +#---------------------------------------------------------------------- -# All other Markdown files used in the book. -tmp/%.md : %.md - @mkdir -p $$(dirname $@) - cp $< $@ +# Book source (in Markdown). These are listed in the order in which +# they appear in the final book-format version of the notes, and +# include Markdown files generated by other tools from other formats. 
+BOOK_SRC = \ + intro.md \ + team.md \ + novice/shell/index.md $(sort $(wildcard novice/shell/??-*.md)) \ + novice/git/index.md $(sort $(wildcard novice/git/??-*.md)) \ + novice/python/index.md $(IPYNB_TX_PYTHON) \ + novice/sql/index.md $(IPYNB_TX_SQL) \ + novice/extras/index.md $(sort $(wildcard novice/extras/??-*.md)) \ + novice/teaching/index.md $(sort $(wildcard novice/teaching/??-*.md)) \ + novice/ref/index.md $(sort $(wildcard novice/ref/??-*.md)) \ + bib.md \ + gloss.md \ + rules.md \ + LICENSE.md -#----------------------------------------------------------- +# All source pages (including things not in the book). +PAGES_SRC = \ + contents.md \ + $(BOOK_SRC) + +# Build the temporary input for the book by concatenating relevant +# sections of Markdown files, patching glossary references and image +# paths, and then running the whole shebang through Jekyll at the same +# time as everything else. +$(BOOK_MD) : $(PAGES_SRC) bin/make-book.py + python bin/make-book.py $(BOOK_SRC) > $@ + +# Convert from Markdown to HTML. This builds *all* the pages (Jekyll +# only does batch mode), and erases the SITE directory first, so +# having the output index.html file depend on all the page source +# Markdown files triggers the desired build once and only once. +$(INDEX) : $(BOOK_MD) + jekyll -t build -d $(SITE) + +#---------------------------------------------------------------------- +# Targets. +#---------------------------------------------------------------------- ## commands : show all commands. commands : @grep -E '^##' Makefile | sed -e 's/## //g' +## site : build the site as GitHub will see it. +site : $(INDEX) $(BOOK) + +## install : install on the server. +install : $(INDEX) + rm -rf $(INSTALL) + mkdir -p $(INSTALL) + cp -r $(SITE)/* $(INSTALL) + mv $(INSTALL)/contents.html $(INSTALL)/index.html + ## contribs : list contributors (uses .mailmap file). contribs : git log --pretty=format:%aN | sort | uniq ## fixme : find places where fixes are needed. 
fixme : - @grep -i -n FIXME $$(find -f shell git python sql -type f -print | grep -v .ipynb_checkpoints) - -## gloss : check glossary. -gloss : - @bin/gloss.py ./gloss.md $(MARKDOWN_DST) $(NOTEBOOK_DST) - -## images : create a temporary page to display images. -images : - @bin/make-image-page.py $(MARKDOWN_SRC) $(NOTEBOOK_SRC) > image-page.html - @echo "Open ./image-page.html to view images" - -## valid : check validity of HTML book. -# Depends on xmllint to check validity of generated pages. -# Also depends on linklint, an HTML link-checking module from -# http://www.linklint.org/, which has been put in bin/linklint. -# Look in output directory's 'error.txt' file for results. -valid : tmp-book.html - xmllint --noout tmp-book.html 2>&1 | python bin/unwarn.py - @bin/linklint -doc $(LINK_OUT) -textonly -root $(OUT) /@ + @grep -i -n FIXME $$(find novice -type f -print | grep -v .ipynb_checkpoints) ## clean : clean up all generated files. clean : tidy - @rm -rf $(OUT) $(NOTEBOOK_MD) + rm -rf $(SITE) -## tidy : clean up intermediate files only. +## tidy : clean up odds and ends. tidy : - @rm -rf \ - image-page.html \ - tmp \ + rm -rf \ $$(find . -name '*~' -print) \ - $$(find . -name '*.pyc' -print) \ - $$(find . -name '??-*_files' -type d -print) - -# show variables (for debugging) -show : - @echo "OUT" $(OUT) - @echo "TMP" $(TMP) - @echo "LINK_OUT" $(LINK_OUT) - @echo "MARKDOWN_SRC" $(MARKDOWN_SRC) - @echo "NOTEBOOK_SRC" $(NOTEBOOK_SRC) - @echo "NOTEBOOK_MD" $(NOTEBOOK_MD) - @echo "HTML_DST" $(HTML_DST) + $$(find . 
-name '*.pyc' -print) diff --git a/_includes/cheat-sheets/sql.md b/_includes/cheat-sheets/sql.md deleted file mode 100644 index 19a66fa20..000000000 --- a/_includes/cheat-sheets/sql.md +++ /dev/null @@ -1,132 +0,0 @@ -Software Carpentry SQL Cheat Sheet -================================== - -Basic Queries -------------- - -Select one or more columns of data from a table: - - SELECT column_name_1, column_name_2 FROM table_name; - -Select all of the columns in a table: - - SELECT * FROM table_name; - -Get only unique lines in a query: - - SELECT DISTINCT column_name FROM table_name; - -Perform calculations in a query: - - SELECT column_name_1, ROUND(column_name_2 / 1000.0) FROM table_name; - - -Filtering ---------- - -Select only the data meeting certain criteria: - - SELECT * FROM table_name WHERE column_name = 'Hello World'; - -Combine conditions: - - SELECT * FROM table_name WHERE (column_name_1 >= 1000) AND (column_name_2 = 'A' OR column_name_2 = 'B'); - - -Sorting -------- - -Sort results using `ASC` for ascending order or `DESC` for descending order: - - SELECT * FROM table_name ORDER BY column_name_1 ASC, column_name_2 DESC; - - -Missing Data ------------- - -Use `NULL` to represent missing data. - -`NULL` is neither true nor false. -Operations involving `NULL` produce `NULL`, e.g., `1+NULL`, `2>NULL`, and `3=NULL` are all `NULL`. 
- -Test whether a value is null: - - SELECT * FROM table_name WHERE column_name IS NULL; - -Test whether a value is not null: - - SELECT * FROM table_name WHERE column_name IS NOT NULL; - - -Grouping and Aggregation ------------------------- - -Combine data into groups and calculate combined values in groups: - - SELECT column_name_1, SUM(column_name_2), COUNT(*) FROM table_name GROUP BY column_name_1; - - -Joins ------ - -Join data from two tables: - - SELECT * FROM table_name_1 JOIN table_name_2 ON table_name_1.column_name = table_name_2.column_name; - - -Combining Commands ------------------- - -SQL commands must be combined in the following order: -`SELECT`, `FROM`, `JOIN`, `ON`, `WHERE`, `GROUP BY`, `ORDER BY`. - - -Creating Tables ---------------- - -Create tables by specifying column names and types. -Include primary and foreign key relationships and other constraints. - - CREATE TABLE survey( - taken INTEGER NOT NULL, - person TEXT, - quant REAL NOT NULL, - PRIMARY KEY(taken, quant), - FOREIGN KEY(person) REFERENCES person(ident) - ); - -Transactions ------------- - -Put multiple queries in a transaction to ensure they are ACID -(atomic, consistent, isolated, and durable): - - BEGIN TRANSACTION; - DELETE FROM table_name_1 WHERE condition; - INSERT INTO table_name_2 values(...); - END TRANSACTION; - -Programming ------------ - -Execute queries in a general-purpose programming language by: - -* loading the appropriate library -* creating a connection -* creating a cursor -* repeatedly: - * execute a query - * fetch some or all results -* disposing of the cursor -* closing the connection - -Python example: - - import sqlite3 - connection = sqlite3.connect("database_name") - cursor = connection.cursor() - cursor.execute("...query...") - for r in cursor.fetchall(): - ...process result r... 
- cursor.close() - connection.close() diff --git a/_layouts/book.html b/_layouts/book.html new file mode 100644 index 000000000..3a1bc4ed3 --- /dev/null +++ b/_layouts/book.html @@ -0,0 +1,52 @@ +--- +--- + + + + {% if page.title %} + {{ page.title }} + {% endif %} + {% include header.html %} + + + +
+ {% include banner.html %} + + + + {% include footer.html %} +
+ {% include javascript.html %} + + diff --git a/_templates/book.tpl b/_templates/book.tpl deleted file mode 100644 index 8217309e0..000000000 --- a/_templates/book.tpl +++ /dev/null @@ -1,61 +0,0 @@ - - - - - - - - - - - - - - - - - Software Carpentry - - -
- -
-
- -
-

Software Carpentry

-

Volume 1: Basics

-

edited by Greg Wilson

-
- - - -$body$ - -
-
- - -
- - diff --git a/bib.md b/bib.md index af7b39cfa..71e087b3e 100644 --- a/bib.md +++ b/bib.md @@ -3,8 +3,8 @@ layout: lesson root: . title: Recommended Reading --- -Papers ------- + +## Papers * Susan A. Ambrose, Michael W. Bridges, Michele DiPietro, Marsha C. Lovett, and Marie K. Norman: *[How Learning Works: Seven Research-Based Principles for Smart Teaching](http://www.amazon.com/How-Learning-Works-Research-Based-Principles/dp/0470484101/)*. @@ -74,8 +74,7 @@ Papers Describes what we've learned about how to teach programming to scientists over the last 15 years. -Books ------ +## Books * Chris Fehily: *SQL: Visual QuickStart Guide* (3rd ed). diff --git a/bin/make-book.py b/bin/make-book.py index b50addbc4..d3c2310dc 100644 --- a/bin/make-book.py +++ b/bin/make-book.py @@ -1,29 +1,37 @@ import sys import os.path +# Header required to make this a Jekyll file. +HEADER = '''--- +layout: book +root: . +---''' + def main(): + print HEADER for filename in sys.argv[1:]: with open(filename, 'r') as reader: lines = reader.readlines() - title = extract_title(filename, lines) - lines = skip(filename, lines, True, '---', '---') - lines = skip(filename, lines, False, '
') + title = None + if lines[0].startswith('---'): + lines, skipped = skip(filename, lines, '---', '---') + title = extract_title(filename, skipped) + + lines, _ = skip(filename, lines, '
') + + lines = fix_image_paths(filename, lines) + lines = fix_gloss(filename, lines) if title: - print make_title(filename, title) + print format_title(filename, title) for line in lines: print line.rstrip() print -def extract_title(filename, lines): - for ln in lines: - if ln.startswith('title:'): - return ln.split(':', 1)[1].strip() - return None - -def skip(filename, lines, required, open, close): +def skip(filename, lines, open, close): + '''Skip a block of lines starting with open and ending with close.''' i_open = None i_close = None for (i, ln) in enumerate(lines): @@ -31,15 +39,41 @@ def skip(filename, lines, required, open, close): i_open = i elif (i_open is not None) and ln.startswith(close): i_close = i - return lines[:i_open] + lines[i_close+1:] - assert not required, 'Did not find "{0}" to "{1}" in {2}'.format(open, close, filename) + return lines[:i_open] + lines[i_close+1:], lines[i_open:i_close] + else: + return lines, None + +def fix_image_paths(filename, lines): + '''Modify image paths to include directory.''' + front, _ = os.path.split(filename) + front = front.replace('cached/', '') + src = '', title, '
']) + return '\n'.join(['
', title, '
']) else: return title diff --git a/bin/make-image-page.py b/bin/make-image-page.py deleted file mode 100755 index b9b0a8ba4..000000000 --- a/bin/make-image-page.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python - -'''Make a temporary page displaying all the images used in a set of lessons.''' - -import sys -import os -import re - -HEADER = ''' -''' - -ENTRY = '
{1}

{1} ({0})

' - -FOOTER = ''' -''' - -MARKDOWN_P = re.compile(r'') -IPYNB_P = re.compile(r'') - -def main(filenames): - print HEADER - for f in filenames: - with open(f, 'r') as reader: - for line in reader: - display(f, line) - print FOOTER - -def display(filename, line): - for p in (MARKDOWN_P, IPYNB_P): - m = p.search(line) - if not m: continue - relative_path = m.group(1) - alt_text = m.group(2) - modified_path = adjust_path(filename, relative_path) - print ENTRY.format(modified_path, alt_text) - -def adjust_path(base_filename, relative_path): - fixed = os.path.join(os.path.dirname(base_filename), relative_path) - return fixed.replace('/files/', '/') - -if __name__ == '__main__': - main(sys.argv[1:]) diff --git a/cached/novice/python/01-numpy.md b/cached/novice/python/01-numpy.md new file mode 100644 index 000000000..e87745d78 --- /dev/null +++ b/cached/novice/python/01-numpy.md @@ -0,0 +1,709 @@ +--- +layout: lesson +root: ../.. +--- + +## Analyzing Patient Data + + +
+

We are studying inflammation in patients who have been given a new treatment for arthritis, and need to analyze the first dozen data sets. The data sets are stored in comma-separated values (CSV) format: each row holds information for a single patient, and the columns represent successive days. The first few rows of our first file look like this:

+
0,0,1,3,1,2,4,7,8,3,3,3,10,5,7,4,7,7,12,18,6,13,11,11,7,7,4,6,8,8,4,4,5,7,3,4,2,3,0,0
+0,1,2,1,2,1,3,2,2,6,10,11,5,9,4,4,7,16,8,6,18,4,12,5,12,7,11,5,11,3,3,5,4,4,5,5,1,1,0,1
+0,1,1,3,3,2,6,2,5,9,5,7,4,5,4,15,5,11,9,10,19,14,12,17,7,12,11,7,4,2,10,5,4,2,2,3,2,2,1,1
+0,0,2,0,4,2,2,1,6,7,10,7,9,13,8,8,15,10,10,7,17,4,4,7,6,15,6,4,9,11,3,5,6,3,3,4,2,3,2,1
+0,1,1,3,3,1,3,5,2,4,4,7,6,5,3,10,8,10,6,17,9,14,9,7,13,9,12,6,7,7,9,6,3,2,2,4,2,0,1,1
+
+ + +
+

We want to:

+
    +
  • load that data into memory,
  • +
  • calculate the average inflammation per day across all patients, and
  • +
  • plot the result.
  • +
+

To do all that, we'll have to learn a little bit about programming.

+
+ + +
+

Objectives

+
    +
  • Explain what a library is, and what libraries are used for.
  • +
  • Load a Python library and use the things it contains.
  • +
  • Read tabular data from a file into a program.
  • +
  • Assign values to variables.
  • +
  • Select individual values and subsections from data.
  • +
  • Perform operations on arrays of data.
  • +
  • Display simple graphs.
  • +
+
+ +### Loading Data + + +
+

Words are useful, but what's more useful are the sentences and stories we use them to build. Similarly, while a lot of powerful tools are built into languages like Python, even more lives in the libraries they are used to build.

+

In order to load our inflammation data, we need to import a library called NumPy that knows how to operate on matrices:

+
+ + +
+
import numpy
+
+ + +
+

Importing a library is like getting a piece of lab equipment out of a storage locker and setting it up on the bench. Once it's done, we can ask the library to read our data file for us:

+
+ + +
+
numpy.loadtxt(fname='inflammation-01.csv', delimiter=',')
+
+ +
+
array([[ 0.,  0.,  1., ...,  3.,  0.,  0.],
+       [ 0.,  1.,  2., ...,  1.,  0.,  1.],
+       [ 0.,  1.,  1., ...,  2.,  1.,  1.],
+       ..., 
+       [ 0.,  1.,  1., ...,  1.,  1.,  1.],
+       [ 0.,  0.,  0., ...,  0.,  2.,  0.],
+       [ 0.,  0.,  1., ...,  1.,  1.,  0.]])
+
+ + +
+

The expression numpy.loadtxt(...) is a function call that asks Python to run the function loadtxt that belongs to the numpy library. This dotted notation is used everywhere in Python to refer to the parts of things as whole.part.

+

numpy.loadtxt has two parameters: the name of the file we want to read, and the delimiter that separates values on a line. These both need to be character strings (or strings for short), so we put them in quotes.

+

When we are finished typing and press Shift+Enter, the notebook runs our command. Since we haven't told it to do anything else with the function's output, the notebook displays it. In this case, that output is the data we just loaded. By default, only a few rows and columns are shown (with ... displayed to mark missing data). To save space, Python displays numbers as 1. instead of 1.0 when there's nothing interesting after the decimal point.

+
+ + +
+

Our call to numpy.loadtxt read our file, but didn't save the data in memory. To do that, we need to assign the array to a variable. A variable is just a name for a value, such as x, current_temperature, or subject_id. We can create a new variable simply by assigning a value to it using =:

+
+ + +
+
weight_kg = 55
+
+ + +
+

Once a variable has a value, we can print it:

+
+ + +
+
print weight_kg
+
+ +
+
55
+
+
+ + +
+

and do arithmetic with it:

+
+ + +
+
print 'weight in pounds:', 2.2 * weight_kg
+
+ +
+
weight in pounds: 121.0
+
+
+ + +
+

We can also change a variable's value by assigning it a new one:

+
+ + +
+
weight_kg = 57.5
+print 'weight in kilograms is now:', weight_kg
+
+ +
+
weight in kilograms is now: 57.5
+
+
+ + +
+

As the example above shows, we can print several things at once by separating them with commas.

+

If we imagine the variable as a sticky note with a name written on it, assignment is like putting the sticky note on a particular value:

+
+ + +
+

Variables as Sticky Notes

+
+ + +
+

This means that assigning a value to one variable does not change the values of other variables. For example, let's store the subject's weight in pounds in a variable:

+
+ + +
+
weight_lb = 2.2 * weight_kg
+print 'weight in kilograms:', weight_kg, 'and in pounds:', weight_lb
+
+ +
+
weight in kilograms: 57.5 and in pounds: 126.5
+
+
+ + +
+

Creating Another Variable

+
+ + +
+

and then change weight_kg:

+
+ + +
+
weight_kg = 100.0
+print 'weight in kilograms is now:', weight_kg, 'and weight in pounds is still:', weight_lb
+
+ +
+
weight in kilograms is now: 100.0 and weight in pounds is still: 126.5
+
+
+ + +
+

Updating a Variable

+
+ + +
+

Since weight_lb doesn't "remember" where its value came from, it isn't automatically updated when weight_kg changes. This is different from the way spreadsheets work.

+

Now that we know how to assign things to variables, let's re-run numpy.loadtxt and save its result:

+
+ + +
+
data = numpy.loadtxt(fname='inflammation-01.csv', delimiter=',')
+
+ + +
+

This statement doesn't produce any output because assignment doesn't display anything. If we want to check that our data has been loaded, we can print the variable's value:

+
+ + +
+
print data
+
+ +
+
[[ 0.  0.  1. ...,  3.  0.  0.]
+ [ 0.  1.  2. ...,  1.  0.  1.]
+ [ 0.  1.  1. ...,  2.  1.  1.]
+ ..., 
+ [ 0.  1.  1. ...,  1.  1.  1.]
+ [ 0.  0.  0. ...,  0.  2.  0.]
+ [ 0.  0.  1. ...,  1.  1.  0.]]
+
+
+ + +
+

Challenges

+
    +
  1. Draw diagrams showing what variables refer to what values after each statement in the following program:

    +
    mass = 47.5
    +age = 122
    +mass = mass * 2.0
    +age = age - 20
  2. +
  3. What does the following program print out? ~python first, second = 'Grace', 'Hopper' third, fourth = second, first print third, fourth ~

  4. +
+
+ +### Manipulating Data + + +
+

Now that our data is in memory, we can start doing things with it. First, let's ask what type of thing data refers to:

+
+ + +
+
print type(data)
+
+ +
+
<type 'numpy.ndarray'>
+
+
+ + +
+

The output tells us that data currently refers to an N-dimensional array created by the NumPy library. We can see what its shape is like this:

+
+ + +
+
print data.shape
+
+ +
+
(60, 40)
+
+
+ + +
+

This tells us that data has 60 rows and 40 columns. data.shape is a member of data, i.e., a value that is stored as part of a larger value. We use the same dotted notation for the members of values that we use for the functions in libraries because they have the same part-and-whole relationship.

+
+ + +
+

If we want to get a single value from the matrix, we must provide an index in square brackets, just as we do in math:

+
+ + +
+
print 'first value in data:', data[0, 0]
+
+ +
+
first value in data: 0.0
+
+
+ + +
+
print 'middle value in data:', data[30, 20]
+
+ +
+
middle value in data: 13.0
+
+
+ + +
+

The expression data[30, 20] may not surprise you, but data[0, 0] might. Programming languages like Fortran and MATLAB start counting at 1, because that's what human beings have done for thousands of years. Languages in the C family (including C++, Java, Perl, and Python) count from 0 because that's simpler for computers to do. As a result, if we have an M×N array in Python, its indices go from 0 to M-1 on the first axis and 0 to N-1 on the second. It takes a bit of getting used to, but one way to remember the rule is that the index is how many steps we have to take from the start to get the item we want.

+
+

In the Corner

+

What may also surprise you is that when Python displays an array, it shows the element with index [0, 0] in the upper left corner rather than the lower left. This is consistent with the way mathematicians draw matrices, but different from the Cartesian coordinates. The indices are (row, column) instead of (column, row) for the same reason.

+
+
+ + +
+

An index like [30, 20] selects a single element of an array, but we can select whole sections as well. For example, we can select the first ten days (columns) of values for the first four (rows) patients like this:

+
+ + +
+
print data[0:4, 0:10]
+
+ +
+
[[ 0.  0.  1.  3.  1.  2.  4.  7.  8.  3.]
+ [ 0.  1.  2.  1.  2.  1.  3.  2.  2.  6.]
+ [ 0.  1.  1.  3.  3.  2.  6.  2.  5.  9.]
+ [ 0.  0.  2.  0.  4.  2.  2.  1.  6.  7.]]
+
+
+ + +
+

The slice 0:4 means, "Start at index 0 and go up to, but not including, index 4." Again, the up-to-but-not-including takes a bit of getting used to, but the rule is that the difference between the upper and lower bounds is the number of values in the slice.

+

We don't have to start slices at 0:

+
+ + +
+
print data[5:10, 0:10]
+
+ +
+
[[ 0.  0.  1.  2.  2.  4.  2.  1.  6.  4.]
+ [ 0.  0.  2.  2.  4.  2.  2.  5.  5.  8.]
+ [ 0.  0.  1.  2.  3.  1.  2.  3.  5.  3.]
+ [ 0.  0.  0.  3.  1.  5.  6.  5.  5.  8.]
+ [ 0.  1.  1.  2.  1.  3.  5.  3.  5.  8.]]
+
+
+ + +
+

and we don't have to take all the values in the slice---if we provide a stride, Python takes values spaced that far apart:

+
+ + +
+
print data[0:10:3, 0:10:2]
+
+ +
+
[[ 0.  1.  1.  4.  8.]
+ [ 0.  2.  4.  2.  6.]
+ [ 0.  2.  4.  2.  5.]
+ [ 0.  1.  1.  5.  5.]]
+
+
+ + +
+

Here, we have taken rows 0, 3, 6, and 9, and columns 0, 2, 4, 6, and 8. (Again, we always include the lower bound, but stop when we reach or cross the upper bound.)

+
+ + +
+

We also don't have to include the upper and lower bound on the slice. If we don't include the lower bound, Python uses 0 by default; if we don't include the upper, the slice runs to the end of the axis, and if we don't include either (i.e., if we just use ':' on its own), the slice includes everything:

+
+ + +
+
small = data[:3, 36:]
+print 'small is:'
+print small
+
+ +
+
small is:
+[[ 2.  3.  0.  0.]
+ [ 1.  1.  0.  1.]
+ [ 2.  2.  1.  1.]]
+
+
+ + +
+

Arrays also know how to perform common mathematical operations on their values. If we want to find the average inflammation for all patients on all days, for example, we can just ask the array for its mean value

+
+ + +
+
print data.mean()
+
+ +
+
6.14875
+
+
+ + +
+

mean is a method of the array, i.e., a function that belongs to it in the same way that the member shape does. If variables are nouns, methods are verbs: they are what the thing in question knows how to do. This is why data.shape doesn't need to be called (it's just a thing) but data.mean() does (it's an action). It is also why we need empty parentheses for data.mean(): even when we're not passing in any parameters, parentheses are how we tell Python to go and do something for us.

+

NumPy arrays have lots of useful methods:

+
+ + +
+
print 'maximum inflammation:', data.max()
+print 'minimum inflammation:', data.min()
+print 'standard deviation:', data.std()
+
+ +
+
maximum inflammation: 20.0
+minimum inflammation: 0.0
+standard deviation: 4.61383319712
+
+
+ + +
+

When analyzing data, though, we often want to look at partial statistics, such as the maximum value per patient or the average value per day. One way to do this is to select the data we want and use it to create a new temporary array, then ask it to do the calculation:

+
+ + +
+
patient_0 = data[0, :] # 0 on the first axis, everything on the second
+print 'maximum inflammation for patient 0:', patient_0.max()
+
+ +
+
maximum inflammation for patient 0: 18.0
+
+
+ + +
+

We don't actually need to store the row in a variable of its own. Instead, we can combine the selection and the method call:

+
+ + +
+
print 'maximum inflammation for patient 2:', data[2, :].max()
+
+ +
+
maximum inflammation for patient 2: 19.0
+
+
+ + +
+

What if we need the maximum inflammation for all patients, or the average for each day? As the diagram below shows, we want to perform the operation across an axis:

+
+ + +
+

Operations Across Axes

+
+ + +
+

To support this, most array methods allow us to specify the axis we want to work on. If we ask for the average across axis 0, we get:

+
+ + +
+
print data.mean(axis=0)
+
+ +
+
[  0.           0.45         1.11666667   1.75         2.43333333   3.15
+   3.8          3.88333333   5.23333333   5.51666667   5.95         5.9
+   8.35         7.73333333   8.36666667   9.5          9.58333333
+  10.63333333  11.56666667  12.35        13.25        11.96666667
+  11.03333333  10.16666667  10.           8.66666667   9.15         7.25
+   7.33333333   6.58333333   6.06666667   5.95         5.11666667   3.6
+   3.3          3.56666667   2.48333333   1.5          1.13333333
+   0.56666667]
+
+
+ + +
+

As a quick check, we can ask this array what its shape is:

+
+ + +
+
print data.mean(axis=0).shape
+
+ +
+
(40,)
+
+
+ + +
+

The expression (40,) tells us we have an N×1 vector, so this is the average inflammation per day for all patients. If we average across axis 1, we get:

+
+ + +
+
print data.mean(axis=1)
+
+ +
+
[ 5.45   5.425  6.1    5.9    5.55   6.225  5.975  6.65   6.625  6.525
+  6.775  5.8    6.225  5.75   5.225  6.3    6.55   5.7    5.85   6.55
+  5.775  5.825  6.175  6.1    5.8    6.425  6.05   6.025  6.175  6.55
+  6.175  6.35   6.725  6.125  7.075  5.725  5.925  6.15   6.075  5.75
+  5.975  5.725  6.3    5.9    6.75   5.925  7.225  6.15   5.95   6.275  5.7
+  6.1    6.825  5.975  6.725  5.7    6.25   6.4    7.05   5.9  ]
+
+
+ + +
+

which is the average inflammation per patient across all days.

+
+ + +
+

Challenges

+

A subsection of an array is called a slice. We can take slices of character strings as well:

+
+ + +
+
element = 'oxygen'
+print 'first three characters:', element[0:3]
+print 'last three characters:', element[3:6]
+
+ +
+
first three characters: oxy
+last three characters: gen
+
+
+ + +
+
    +
  1. What is the value of element[:4]? What about element[4:]? Or element[:]?

  2. +
  3. What is element[-1]? What is element[-2]? Given those answers, explain what element[1:-1] does.

  4. +
  5. The expression element[3:3] produces an empty string, i.e., a string that contains no characters. If data holds our array of patient data, what does data[3:3, 4:4] produce? What about data[3:3, :]?

  6. +
+
+ +### Plotting + + +
+

The mathematician Richard Hamming once said, "The purpose of computing is insight, not numbers," and the best way to develop insight is often to visualize data. Visualization deserves an entire lecture (or course) of its own, but we can explore a few features of Python's matplotlib here. First, let's tell the IPython Notebook that we want our plots displayed inline, rather than in a separate viewing window:

+
+ + +
+
%matplotlib inline
+
+ + +
+

The % at the start of the line signals that this is a command for the notebook, rather than a statement in Python. Next, we will import the pyplot module from matplotlib and use two of its functions to create and display a heat map of our data:

+
+ + +
+
from matplotlib import pyplot
+pyplot.imshow(data)
+pyplot.show()
+
+ +
+

+
+ + +
+

Blue regions in this heat map are low values, while red shows high values. As we can see, inflammation rises and falls over a 40-day period. Let's take a look at the average inflammation over time:

+
+ + +
+
ave_inflammation = data.mean(axis=0)
+pyplot.plot(ave_inflammation)
+pyplot.show()
+
+ +
+

+
+ + +
+

Here, we have put the average per day across all patients in the variable ave_inflammation, then asked pyplot to create and display a line graph of those values. The result is roughly a linear rise and fall, which is suspicious: based on other studies, we expect a sharper rise and slower fall. Let's have a look at two other statistics:

+
+ + +
+
print 'maximum inflammation per day'
+pyplot.plot(data.max(axis=0))
+pyplot.show()
+
+print 'minimum inflammation per day'
+pyplot.plot(data.min(axis=0))
+pyplot.show()
+
+ +
+
maximum inflammation per day
+minimum inflammation per day
+
+
+ + +
+

The maximum value rises and falls perfectly smoothly, while the minimum seems to be a step function. Neither result seems particularly likely, so either there's a mistake in our calculations or something is wrong with our data.

+
+ + +
+

Challenges

+
    +
  1. Why do all of our plots stop just short of the upper end of our graph? Why are the vertical lines in our plot of the minimum inflammation per day not vertical?

  2. +
  3. Create a plot showing the standard deviation of the inflammation data for each day across all patients.

  4. +
+
+ +### Wrapping Up + + +
+

It's very common to create an alias for a library when importing it in order to reduce the amount of typing we have to do. Here are our three plots side by side using aliases for numpy and pyplot:

+
+ + +
+
import numpy as np
+from matplotlib import pyplot as plt
+
+data = np.loadtxt(fname='inflammation-01.csv', delimiter=',')
+
+plt.figure(figsize=(10.0, 3.0))
+
+plt.subplot(1, 3, 1)
+plt.ylabel('average')
+plt.plot(data.mean(0))
+
+plt.subplot(1, 3, 2)
+plt.ylabel('max')
+plt.plot(data.max(0))
+
+plt.subplot(1, 3, 3)
+plt.ylabel('min')
+plt.plot(data.min(0))
+
+plt.tight_layout()
+plt.show()
+
+ +
+

+
+ + +
+

The first two lines re-load our libraries as np and plt, which are the aliases most Python programmers use. The call to loadtxt reads our data, and the rest of the program tells the plotting library how large we want the figure to be, that we're creating three sub-plots, what to draw for each one, and that we want a tight layout. (Perversely, if we leave out that call to plt.tight_layout(), the graphs will actually be squeezed together more closely.)

+
+ + +
+

Challenges

+
    +
  1. Modify the program to display the three plots on top of one another instead of side by side.
  2. +
+
+ + +
+

Key Points

+
    +
  • Import a library into a program using import libraryname.
  • +
  • Use the numpy library to work with arrays in Python.
  • +
  • Use variable = value to assign a value to a variable in order to record it in memory.
  • +
  • Variables are created on demand whenever a value is assigned to them.
  • +
  • Use print something to display the value of something.
  • +
  • The expression array.shape gives the shape of an array.
  • +
  • Use array[x, y] to select a single element from an array.
  • +
  • Array indices start at 0, not 1.
  • +
  • Use low:high to specify a slice that includes the indices from low to high-1.
  • +
  • All the indexing and slicing that works on arrays also works on strings.
  • +
  • Use # some kind of explanation to add comments to programs.
  • +
  • Use array.mean(), array.max(), and array.min() to calculate simple statistics.
  • +
  • Use array.mean(axis=0) or array.mean(axis=1) to calculate statistics across the specified axis.
  • +
  • Use the pyplot library from matplotlib for creating simple visualizations.
  • +
+
+ + +
+

Next Steps

+

Our work so far has convinced us that something's wrong with our first data file. We would like to check the other 11 the same way, but typing in the same commands repeatedly is tedious and error-prone. Since computers don't get bored (that we know of), we should create a way to do a complete analysis with a single command, and then figure out how to repeat that step once for each file. These operations are the subjects of the next two lessons.

+
diff --git a/cached/novice/python/02-func.md b/cached/novice/python/02-func.md new file mode 100644 index 000000000..87e3f95f3 --- /dev/null +++ b/cached/novice/python/02-func.md @@ -0,0 +1,850 @@ +--- +layout: lesson +root: ../.. +--- + +## Creating Functions + + +
+

If we only had one data set to analyze, it would probably be faster to load the file into a spreadsheet and use that to plot some simple statistics. But we have twelve files to check, and may have more in future. In this lesson, we'll learn how to write a function so that we can repeat several operations with a single command.

+
+ + +
+

Objectives

+
    +
  • Define a function that takes parameters.
  • +
  • Return a value from a function.
  • +
  • Test and debug a function.
  • +
  • Explain what a call stack is, and trace changes to the call stack as functions are called.
  • +
  • Set default values for function parameters.
  • +
  • Explain why we should divide programs into small, single-purpose functions.
  • +
+
+ +### Defining a Function + + +
+

Let's start by defining a function fahr_to_kelvin that converts temperatures from Fahrenheit to Kelvin:

+
+ + +
+
def fahr_to_kelvin(temp):
+    return ((temp - 32) * (5/9)) + 273.15
+
+ + +
+

The definition opens with the word def, which is followed by the name of the function and a parenthesized list of parameter names. The body of the function—the statements that are executed when it runs—is indented below the definition line, typically by four spaces.

+

When we call the function, the values we pass to it are assigned to those variables so that we can use them inside the function. Inside the function, we use a return statement to send a result back to whoever asked for it.

+

Let's try running our function. Calling our own function is no different from calling any other function:

+
+ + +
+
print 'freezing point of water:', fahr_to_kelvin(32)
+print 'boiling point of water:', fahr_to_kelvin(212)
+
+ +
+
freezing point of water: 273.15
+boiling point of water: 273.15
+
+
+ + +
+

We've successfully called the function that we defined, and we have access to the value that we returned. Unfortunately, the value returned doesn't look right. What went wrong?

+
+ +### Debugging a Function + + +
+

Debugging is when we fix a piece of code that we know is working incorrectly. In this case, we know that fahr_to_kelvin is giving us the wrong answer, so let's find out why.

+

For big pieces of code, there are tools called debuggers that aid in this process.

+

We just have a short function, so we'll debug by choosing some parameter value, breaking our function into small parts, and printing out the value of each part.

+
+ + +
+
# We'll use temp = 212, the boiling point of water, which was incorrect
+print "212 - 32:", 212 - 32
+
+ +
+
212 - 32: 180
+
+
+ + +
+
print "(212 - 32) * (5/9):", (212 - 32) * (5/9)
+
+ +
+
(212 - 32) * (5/9): 0
+
+
+ + +
+

Aha! The problem comes when we multiply by 5/9. This is because 5/9 is actually 0.

+
+ + +
+
5/9
+
+ +
+
0
+
+ + +
+

Computers store numbers in one of two ways: as integers or as floating-point numbers (or floats). The first are the numbers we usually count with; the second have fractional parts. Addition, subtraction and multiplication work on both as we'd expect, but division works differently. If we divide one integer by another, we get the quotient without the remainder:

+
+ + +
+
print '10/3 is:', 10/3
+
+ +
+
10/3 is: 3
+
+
+ + +
+

If either part of the division is a float, on the other hand, the computer creates a floating-point answer:

+
+ + +
+
print '10.0/3 is:', 10.0/3
+
+ +
+
10.0/3 is: 3.33333333333
+
+
+ + +
+

The computer does this for historical reasons: integer operations were much faster on early machines, and this behavior is actually useful in a lot of situations. It's still confusing, though, so Python 3 produces a floating-point answer when dividing integers if it needs to. We're still using Python 2.7 in this class, though, so if we want 5/9 to give us the right answer, we have to write it as 5.0/9, 5/9.0, or some other variation.

+
+ + +
+

Let's fix our fahr_to_kelvin function with this new knowledge.

+
+ + +
+
def fahr_to_kelvin(temp):
+    return ((temp - 32) * (5.0/9.0)) + 273.15
+
+print 'freezing point of water:', fahr_to_kelvin(32)
+print 'boiling point of water:', fahr_to_kelvin(212)
+
+ +
+
freezing point of water: 273.15
+boiling point of water: 373.15
+
+
+ + +
+

It works!

+
+ +### Composing Functions + + +
+

Now that we've seen how to turn Fahrenheit into Kelvin, it's easy to turn Kelvin into Celsius:

+
+ + +
+
def kelvin_to_celsius(temp):
+    return temp - 273.15
+
+print 'absolute zero in Celsius:', kelvin_to_celsius(0.0)
+
+ +
+
absolute zero in Celsius: -273.15
+
+
+ + +
+

What about converting Fahrenheit to Celsius? We could write out the formula, but we don't need to. Instead, we can compose the two functions we have already created:

+
+ + +
+
def fahr_to_celsius(temp):
+    temp_k = fahr_to_kelvin(temp)
+    result = kelvin_to_celsius(temp_k)
+    return result
+
+print 'freezing point of water in Celsius:', fahr_to_celsius(32.0)
+
+ +
+
freezing point of water in Celsius: 0.0
+
+
+ + +
+

This is our first taste of how larger programs are built: we define basic operations, then combine them in ever-larger chunks to get the effect we want. Real-life functions will usually be larger than the ones shown here—typically half a dozen to a few dozen lines—but they shouldn't ever be much longer than that, or the next person who reads it won't be able to understand what's going on.

+
+ + +
+

Challenges

+
    +
  1. "Adding" two strings produces their concatenation: 'a' + 'b' is 'ab'. Write a function called fence that takes two parameters called original and wrapper and returns a new string that has the wrapper character at the beginning and end of the original:

    +
    print fence('name', '*')
    +*name*
  2. +
  3. If the variable s refers to a string, then s[0] is the string's first character and s[-1] is its last. Write a function called outer that returns a string made up of just the first and last characters of its input:

    +
    print outer('helium')
    +hm
  4. +
+
+ +### The Call Stack + + +
+

Let's take a closer look at what happens when we call fahr_to_celsius(32.0). To make things clearer, we'll start by putting the initial value 32.0 in a variable and store the final result in one as well:

+
+ + +
+
original = 32.0
+final = fahr_to_celsius(original)
+
+ + +
+

The diagram below shows what memory looks like after the first line has been executed:

+
+ + +
+

Call Stack (Initial State)

+
+ + +
+

When we call fahr_to_celsius, Python doesn't create the variable temp right away. Instead, it creates something called a stack frame to keep track of the variables defined by fahr_to_celsius. Initially, this stack frame only holds the value of temp:

+
+ + +
+

Call Stack Immediately After First Function Call

+
+ + +
+

When we call fahr_to_kelvin inside fahr_to_celsius, Python creates another stack frame to hold fahr_to_kelvin's variables:

+
+ + +
+

Call Stack During First Nested Function Call

+
+ + +
+

It does this because there are now two variables in play called temp: the parameter to fahr_to_celsius, and the parameter to fahr_to_kelvin. Having two variables with the same name in the same part of the program would be ambiguous, so Python (and every other modern programming language) creates a new stack frame for each function call to keep that function's variables separate from those defined by other functions.

+

When the call to fahr_to_kelvin returns a value, Python throws away fahr_to_kelvin's stack frame and creates a new variable in the stack frame for fahr_to_celsius to hold the temperature in Kelvin:

+
+ + +
+

Call Stack After Return From First Nested Function Call

+
+ + +
+

It then calls kelvin_to_celsius, which means it creates a stack frame to hold that function's variables:

+
+ + +
+

Call Stack During Call to Second Nested Function

+
+ + +
+

Once again, Python throws away that stack frame when kelvin_to_celsius is done and creates the variable result in the stack frame for fahr_to_celsius:

+
+ + +
+

Call Stack After Second Nested Function Returns

+
+ + +
+

Finally, when fahr_to_celsius is done, Python throws away its stack frame and puts its result in a new variable called final that lives in the stack frame we started with:

+
+ + +
+

Call Stack After All Functions Have Finished

+
+ + +
+

This final stack frame is always there; it holds the variables we defined outside the functions in our code. What it doesn't hold is the variables that were in the various stack frames. If we try to get the value of temp after our functions have finished running, Python tells us that there's no such thing:

+
+ + +
+
print 'final value of temp after all function calls:', temp
+
+ +
+
---------------------------------------------------------------------------
+NameError                                 Traceback (most recent call last)
+<ipython-input-12-ffd9b4dbd5f1> in <module>()
+----> 1 print 'final value of temp after all function calls:', temp
+
+NameError: name 'temp' is not definedfinal value of temp after all function calls:
+
+ + +
+

Why go to all this trouble? Well, here's a function called span that calculates the difference between the minimum and maximum values in an array:

+
+ + +
+
import numpy
+
+def span(a):
+    diff = a.max() - a.min()
+    return diff
+
+data = numpy.loadtxt(fname='inflammation-01.csv', delimiter=',')
+print 'span of data', span(data)
+
+ +
+
 span of data 20.0
+
+
+ + +
+

Notice that span assigns a value to a variable called diff. We might very well use a variable with the same name to hold data:

+
+ + +
+
diff = numpy.loadtxt(fname='inflammation-01.csv', delimiter=',')
+print 'span of data:', span(diff)
+
+ +
+
span of data: 20.0
+
+
+ + +
+

We don't expect diff to have the value 20.0 after this function call, so the name diff cannot refer to the same thing inside span as it does in the main body of our program. And yes, we could probably choose a different name than diff in our main program in this case, but we don't want to have to read every line of NumPy to see what variable names its functions use before calling any of those functions, just in case they change the values of our variables.

+
+ + +
+

The big idea here is encapsulation, and it's the key to writing correct, comprehensible programs. A function's job is to turn several operations into one so that we can think about a single function call instead of a dozen or a hundred statements each time we want to do something. That only works if functions don't interfere with each other; if they do, we have to pay attention to the details once again, which quickly overloads our short-term memory.

+
+ + +
+

Challenges

+
    +
  1. We previously wrote functions called fence and outer. Draw a diagram showing how the call stack changes when we run the following: `print outer(fence('carbon', '+'))`
  2. +
+
+ +### Testing and Documenting + + +
+

Once we start putting things in functions so that we can re-use them, we need to start testing that those functions are working correctly. To see how to do this, let's write a function to center a dataset around a particular value:

+
+ + +
+
def center(data, desired):
+    return (data - data.mean()) + desired
+
+ + +
+

We could test this on our actual data, but since we don't know what the values ought to be, it will be hard to tell if the result was correct. Instead, let's use NumPy to create a matrix of 0's and then center that around 3:

+
+ + +
+
z = numpy.zeros((2,2))
+print center(z, 3)
+
+ +
+
[[ 3.  3.]
+ [ 3.  3.]]
+
+
+ + +
+

That looks right, so let's try center on our real data:

+
+ + +
+
data = numpy.loadtxt(fname='inflammation-01.csv', delimiter=',')
+print center(data, 0)
+
+ +
+
[[-6.14875 -6.14875 -5.14875 ..., -3.14875 -6.14875 -6.14875]
+ [-6.14875 -5.14875 -4.14875 ..., -5.14875 -6.14875 -5.14875]
+ [-6.14875 -5.14875 -5.14875 ..., -4.14875 -5.14875 -5.14875]
+ ..., 
+ [-6.14875 -5.14875 -5.14875 ..., -5.14875 -5.14875 -5.14875]
+ [-6.14875 -6.14875 -6.14875 ..., -6.14875 -4.14875 -6.14875]
+ [-6.14875 -6.14875 -5.14875 ..., -5.14875 -5.14875 -6.14875]]
+
+
+ + +
+

It's hard to tell from the default output whether the result is correct, but there are a few simple tests that will reassure us:

+
+ + +
+
print 'original min, mean, and max are:', data.min(), data.mean(), data.max()
+centered = center(data, 0)
+print 'min, mean, and and max of centered data are:', centered.min(), centered.mean(), centered.max()
+
+ +
+
original min, mean, and max are: 0.0 6.14875 20.0
+min, mean, and and max of centered data are: -6.14875 -3.49054118942e-15 13.85125
+
+
+ + +
+

That seems almost right: the original mean was about 6.1, so the lower bound from zero is now about -6.1. The mean of the centered data isn't quite zero—we'll explore why not in the challenges—but it's pretty close. We can even go further and check that the standard deviation hasn't changed:

+
+ + +
+
print 'std dev before and after:', data.std(), centered.std()
+
+ +
+
std dev before and after: 4.61383319712 4.61383319712
+
+
+ + +
+

Those values look the same, but we probably wouldn't notice if they were different in the sixth decimal place. Let's do this instead:

+
+ + +
+
print 'difference in standard deviations before and after:', data.std() - centered.std()
+
+ +
+
difference in standard deviations before and after: -3.5527136788e-15
+
+
+ + +
+

Again, the difference is very small. It's still possible that our function is wrong, but it seems unlikely enough that we should probably get back to doing our analysis. We have one more task first, though: we should write some documentation for our function to remind ourselves later what it's for and how to use it.

+

The usual way to put documentation in software is to add comments like this:

+
+ + +
+
# center(data, desired): return a new array containing the original data centered around the desired value.
+def center(data, desired):
+    return (data - data.mean()) + desired
+
+ + +
+

There's a better way, though. If the first thing in a function is a string that isn't assigned to a variable, that string is attached to the function as its documentation:

+
+ + +
+
def center(data, desired):
+    '''Return a new array containing the original data centered around the desired value.'''
+    return (data - data.mean()) + desired
+
+ + +
+

This is better because we can now ask Python's built-in help system to show us the documentation for the function:

+
+ + +
+
help(center)
+
+ +
+
Help on function center in module __main__:
+
+center(data, desired)
+    Return a new array containing the original data centered around the desired value.
+
+
+
+ + +
+

A string like this is called a docstring. We don't need to use triple quotes when we write one, but if we do, we can break the string across multiple lines:

+
+ + +
+
def center(data, desired):
+    '''Return a new array containing the original data centered around the desired value.
+    Example: center([1, 2, 3], 0) => [-1, 0, 1]'''
+    return (data - data.mean()) + desired
+
+help(center)
+
+ +
+
Help on function center in module __main__:
+
+center(data, desired)
+    Return a new array containing the original data centered around the desired value.
+    Example: center([1, 2, 3], 0) => [-1, 0, 1]
+
+
+
+ + +
+

Challenges

+
    +
  1. Write a function called analyze that takes a filename as a parameter and displays the three graphs produced in the previous lesson, i.e., analyze('inflammation-01.csv') should produce the graphs already shown, while analyze('inflammation-02.csv') should produce corresponding graphs for the second data set. Be sure to give your function a docstring.

  2. +
  3. Write a function rescale that takes an array as input and returns a corresponding array of values scaled to lie in the range 0.0 to 1.0. (If \(L\) and \(H\) are the lowest and highest values in the original array, then the replacement for a value \(v\) should be \((v-L) / (H-L)\).) Be sure to give the function a docstring.

  4. +
  5. Run the commands help(numpy.arange) and help(numpy.linspace) to see how to use these functions to generate regularly-spaced values, then use those values to test your rescale function.

  6. +
+
+ +### Defining Defaults + + +
+

We have passed parameters to functions in two ways: directly, as in span(data), and by name, as in numpy.loadtxt(fname='something.csv', delimiter=','). In fact, we can pass the filename to loadtxt without the fname=:

+
+ + +
+
numpy.loadtxt('inflammation-01.csv', delimiter=',')
+
+ +
+
array([[ 0.,  0.,  1., ...,  3.,  0.,  0.],
+       [ 0.,  1.,  2., ...,  1.,  0.,  1.],
+       [ 0.,  1.,  1., ...,  2.,  1.,  1.],
+       ..., 
+       [ 0.,  1.,  1., ...,  1.,  1.,  1.],
+       [ 0.,  0.,  0., ...,  0.,  2.,  0.],
+       [ 0.,  0.,  1., ...,  1.,  1.,  0.]])
+
+ + +
+

but we still need to say delimiter=:

+
+ + +
+
numpy.loadtxt('inflammation-01.csv', ',')
+
+ +
+
---------------------------------------------------------------------------
+TypeError                                 Traceback (most recent call last)
+<ipython-input-26-e3bc6cf4fd6a> in <module>()
+----> 1 numpy.loadtxt('inflammation-01.csv', ',')
+
+/Users/gwilson/anaconda/lib/python2.7/site-packages/numpy/lib/npyio.pyc in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin)
+    775     try:
+    776         # Make sure we're dealing with a proper dtype
+--> 777         dtype = np.dtype(dtype)
+    778         defconv = _getconv(dtype)
+    779 
+
+TypeError: data type "," not understood
+
+ + +
+

To understand what's going on, and make our own functions easier to use, let's re-define our center function like this:

+
+ + +
+
def center(data, desired=0.0):
+    '''Return a new array containing the original data centered around the desired value (0 by default).
+    Example: center([1, 2, 3], 0) => [-1, 0, 1]'''
+    return (data - data.mean()) + desired
+
+ + +
+

The key change is that the second parameter is now written desired=0.0 instead of just desired. If we call the function with two arguments, it works as it did before:

+
+ + +
+
test_data = numpy.zeros((2, 2))
+print center(test_data, 3)
+
+ +
+
[[ 3.  3.]
+ [ 3.  3.]]
+
+
+ + +
+

But we can also now call it with just one parameter, in which case desired is automatically assigned the default value of 0.0:

+
+ + +
+
more_data = 5 + numpy.zeros((2, 2))
+print 'data before centering:', more_data
+print 'centered data:', center(more_data)
+
+ +
+
data before centering: [[ 5.  5.]
+ [ 5.  5.]]
+centered data: [[ 0.  0.]
+ [ 0.  0.]]
+
+
+ + +
+

This is handy: if we usually want a function to work one way, but occasionally need it to do something else, we can allow people to pass a parameter when they need to but provide a default to make the normal case easier. The example below shows how Python matches values to parameters:

+
+ + +
+
def display(a=1, b=2, c=3):
+    print 'a:', a, 'b:', b, 'c:', c
+
+print 'no parameters:'
+display()
+print 'one parameter:'
+display(55)
+print 'two parameters:'
+display(55, 66)
+
+ +
+
no parameters:
+a: 1 b: 2 c: 3
+one parameter:
+a: 55 b: 2 c: 3
+two parameters:
+a: 55 b: 66 c: 3
+
+
+ + +
+

As this example shows, parameters are matched up from left to right, and any that haven't been given a value explicitly get their default value. We can override this behavior by naming the value as we pass it in:

+
+ + +
+
print 'only setting the value of c'
+display(c=77)
+
+ +
+
only setting the value of c
+a: 1 b: 2 c: 77
+
+
+ + +
+

With that in hand, let's look at the help for numpy.loadtxt:

+
+ + +
+
help(numpy.loadtxt)
+
+ +
+
Help on function loadtxt in module numpy.lib.npyio:
+
+loadtxt(fname, dtype=<type 'float'>, comments='#', delimiter=None, converters=None, skiprows=0, usecols=None, unpack=False, ndmin=0)
+    Load data from a text file.
+    
+    Each row in the text file must have the same number of values.
+    
+    Parameters
+    ----------
+    fname : file or str
+        File, filename, or generator to read.  If the filename extension is
+        ``.gz`` or ``.bz2``, the file is first decompressed. Note that
+        generators should return byte strings for Python 3k.
+    dtype : data-type, optional
+        Data-type of the resulting array; default: float.  If this is a
+        record data-type, the resulting array will be 1-dimensional, and
+        each row will be interpreted as an element of the array.  In this
+        case, the number of columns used must match the number of fields in
+        the data-type.
+    comments : str, optional
+        The character used to indicate the start of a comment;
+        default: '#'.
+    delimiter : str, optional
+        The string used to separate values.  By default, this is any
+        whitespace.
+    converters : dict, optional
+        A dictionary mapping column number to a function that will convert
+        that column to a float.  E.g., if column 0 is a date string:
+        ``converters = {0: datestr2num}``.  Converters can also be used to
+        provide a default value for missing data (but see also `genfromtxt`):
+        ``converters = {3: lambda s: float(s.strip() or 0)}``.  Default: None.
+    skiprows : int, optional
+        Skip the first `skiprows` lines; default: 0.
+    usecols : sequence, optional
+        Which columns to read, with 0 being the first.  For example,
+        ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns.
+        The default, None, results in all columns being read.
+    unpack : bool, optional
+        If True, the returned array is transposed, so that arguments may be
+        unpacked using ``x, y, z = loadtxt(...)``.  When used with a record
+        data-type, arrays are returned for each field.  Default is False.
+    ndmin : int, optional
+        The returned array will have at least `ndmin` dimensions.
+        Otherwise mono-dimensional axes will be squeezed.
+        Legal values: 0 (default), 1 or 2.
+        .. versionadded:: 1.6.0
+    
+    Returns
+    -------
+    out : ndarray
+        Data read from the text file.
+    
+    See Also
+    --------
+    load, fromstring, fromregex
+    genfromtxt : Load data with missing values handled as specified.
+    scipy.io.loadmat : reads MATLAB data files
+    
+    Notes
+    -----
+    This function aims to be a fast reader for simply formatted files.  The
+    `genfromtxt` function provides more sophisticated handling of, e.g.,
+    lines with missing values.
+    
+    Examples
+    --------
+    >>> from StringIO import StringIO   # StringIO behaves like a file object
+    >>> c = StringIO("0 1\n2 3")
+    >>> np.loadtxt(c)
+    array([[ 0.,  1.],
+           [ 2.,  3.]])
+    
+    >>> d = StringIO("M 21 72\nF 35 58")
+    >>> np.loadtxt(d, dtype={'names': ('gender', 'age', 'weight'),
+    ...                      'formats': ('S1', 'i4', 'f4')})
+    array([('M', 21, 72.0), ('F', 35, 58.0)],
+          dtype=[('gender', '|S1'), ('age', '<i4'), ('weight', '<f4')])
+    
+    >>> c = StringIO("1,0,2\n3,0,4")
+    >>> x, y = np.loadtxt(c, delimiter=',', usecols=(0, 2), unpack=True)
+    >>> x
+    array([ 1.,  3.])
+    >>> y
+    array([ 2.,  4.])
+
+
+
+ + +
+

There's a lot of information here, but the most important part is the first couple of lines:

+
loadtxt(fname, dtype=<type 'float'>, comments='#', delimiter=None, converters=None, skiprows=0, usecols=None,
+        unpack=False, ndmin=0)
+

This tells us that loadtxt has one parameter called fname that doesn't have a default value, and eight others that do. If we call the function like this:

+
numpy.loadtxt('inflammation-01.csv', ',')
+

then the filename is assigned to fname (which is what we want), but the delimiter string ',' is assigned to dtype rather than delimiter, because dtype is the second parameter in the list. That's why we don't have to provide fname= for the filename, but do have to provide delimiter= for the second parameter.

+
+ + +
+

Challenges

+
    +
  1. Rewrite the normalize function so that it scales data to lie between 0.0 and 1.0 by default, but will allow the caller to specify lower and upper bounds if they want. Compare your implementation to your neighbor's: do the two functions always behave the same way?
  2. +
+
+ + +
+

Key Points

+
    +
  • Define a function using def name(...params...).
  • +
  • The body of a function must be indented.
  • +
  • Call a function using name(...values...).
  • +
  • Numbers are stored as integers or floating-point numbers.
  • +
  • Integer division produces the whole part of the answer (not the fractional part).
  • +
  • Each time a function is called, a new stack frame is created on the call stack to hold its parameters and local variables.
  • +
  • Python looks for variables in the current stack frame before looking for them at the top level.
  • +
  • Use help(thing) to view help for something.
  • +
  • Put docstrings in functions to provide help for that function.
  • +
  • Specify default values for parameters when defining a function using name=value in the parameter list.
  • +
  • Parameters can be passed by matching based on name, by position, or by omitting them (in which case the default value is used).
  • +
+
+ + +
+

Next Steps

+

We now have a function called analyze to visualize a single data set. We could use it to explore all 12 of our current data sets like this:

+
analyze('inflammation-01.csv')
+analyze('inflammation-02.csv')
+...
+analyze('inflammation-12.csv')
+

but the chances of us typing all 12 filenames correctly aren't great, and we'll be even worse off if we get another hundred files. What we need is a way to tell Python to do something once for each file, and that will be the subject of the next lesson.

+
diff --git a/cached/novice/python/03-loop.md b/cached/novice/python/03-loop.md new file mode 100644 index 000000000..2666b4e80 --- /dev/null +++ b/cached/novice/python/03-loop.md @@ -0,0 +1,423 @@ +--- +layout: lesson +root: ../.. +--- + +## Analyzing Multiple Data Sets + + +
+

We have created a function called analyze that creates graphs of the minimum, average, and maximum daily inflammation rates for a single data set:

+
+ + +
+
%matplotlib inline
+
+import numpy as np
+from matplotlib import pyplot as plt
+
+def analyze(filename):
+    data = np.loadtxt(fname=filename, delimiter=',')
+    
+    plt.figure(figsize=(10.0, 3.0))
+    
+    plt.subplot(1, 3, 1)
+    plt.ylabel('average')
+    plt.plot(data.mean(0))
+    
+    plt.subplot(1, 3, 2)
+    plt.ylabel('max')
+    plt.plot(data.max(0))
+    
+    plt.subplot(1, 3, 3)
+    plt.ylabel('min')
+    plt.plot(data.min(0))
+    
+    plt.tight_layout()
+    plt.show()
+
+analyze('inflammation-01.csv')
+
+ +
+

+
+ + +
+

We can use it to analyze other data sets one by one:

+
+ + +
+
analyze('inflammation-02.csv')
+
+ +
+

+
+ + +
+

but we have a dozen data sets right now and more on the way. We want to create plots for all our data sets with a single statement. To do that, we'll have to teach the computer how to repeat things.

+
+ + +
+

Objectives

+
    +
  • Explain what a for loop does.
  • +
  • Correctly write for loops to repeat simple calculations.
  • +
  • Trace changes to a loop variable as the loop runs.
  • +
  • Trace changes to other variables as they are updated by a for loop.
  • +
  • Explain what a list is.
  • +
  • Create and index lists of simple values.
  • +
  • Use a library function to get a list of filenames that match a simple wildcard pattern.
  • +
  • Use a for loop to process multiple files.
  • +
+
+ +### For Loops + + +
+

Suppose we want to print each character in the word "lead" on a line of its own. One way is to use four print statements:

+
+ + +
+
def print_characters(element):
+    print element[0]
+    print element[1]
+    print element[2]
+    print element[3]
+
+print_characters('lead')
+
+ +
+
l
+e
+a
+d
+
+
+ + +
+

but that's a bad approach for two reasons:

+
    +
  1. It doesn't scale: if we want to print the characters in a string that's hundreds of letters long, we'd be better off just typing them in.

  2. +
  3. It's fragile: if we give it a longer string, it only prints part of the data, and if we give it a shorter one, it produces an error because we're asking for characters that don't exist.

  4. +
+
+ + +
+
print_characters('tin')
+
+ +
+
---------------------------------------------------------------------------
+IndexError                                Traceback (most recent call last)
+<ipython-input-13-5bc7311e0bf3> in <module>()
+----> 1 print_characters('tin')
+
+<ipython-input-12-11460561ea56> in print_characters(element)
+      3     print element[1]
+      4     print element[2]
+----> 5     print element[3]
+      6 
+      7 print_characters('lead')
+
+IndexError: string index out of ranget
+i
+n
+
+
+ + +
+

Here's a better approach:

+
+ + +
+
def print_characters(element):
+    for char in element:
+        print char
+
+print_characters('lead')
+
+ + +
+

This is shorter---certainly shorter than something that prints every character in a hundred-letter string---and more robust as well:

+
+ + +
+
print_characters('oxygen')
+
+ + +
+

The improved version of print_characters uses a for loop to repeat an operation---in this case, printing---once for each thing in a collection. The general form of a loop is:

+
+for variable in collection:
+    do things with variable
+
+ +
+ + +
+

We can call the loop variable anything we like, but there must be a colon at the end of the line starting the loop, and we must indent the body of the loop.

+

Here's another loop that repeatedly updates a variable:

+
+ + +
+
length = 0
+for vowel in 'aeiou':
+    length = length + 1
+print 'There are', length, 'vowels'
+
+ + +
+

It's worth tracing the execution of this little program step by step. Since there are five characters in 'aeiou', the statement on line 3 will be executed five times. The first time around, length is zero (the value assigned to it on line 1) and vowel is 'a'. The statement adds 1 to the old value of length, producing 1, and updates length to refer to that new value. The next time around, vowel is 'e' and length is 1, so length is updated to be 2. After three more updates, length is 5; since there is nothing left in 'aeiou' for Python to process, the loop finishes and the print statement on line 4 tells us our final answer.

+

Note that a loop variable is just a variable that's being used to record progress in a loop. It still exists after the loop is over, and we can re-use variables previously defined as loop variables as well:

+
+ + +
+
letter = 'z'
+for letter in 'abc':
+    print letter
+print 'after the loop, letter is', letter
+
+ + +
+

Note also that finding the length of a string is such a common operation that Python actually has a built-in function to do it called len:

+
+ + +
+
print len('aeiou')
+
+ + +
+

len is much faster than any function we could write ourselves, and much easier to read than a two-line loop; it will also give us the length of many other things that we haven't met yet, so we should always use it when we can.

+
+ + +
+

Challenges

+
    +
  1. Python has a built-in function called range that creates a list of numbers: range(3) produces [0, 1, 2], range(2, 5) produces [2, 3, 4], and range(2, 10, 3) produces [2, 5, 8]. Using range, write a function that prints the first \(N\) natural numbers: ~python print_N(3) 1 2 3 ~

  2. +
  3. Exponentiation is built into Python: ~python print 2**4 16 ~ It also has a function called pow that calculates the same value. Write a function called expo that uses a loop to calculate the same result.

  4. +
  5. Python's strings have methods, just like NumPy's arrays. One of these is called reverse: ~python print 'Newton'.reverse() notweN ~ Write a function called rev that does the same thing: ~python print rev('Newton') notweN ~ As always, be sure to include a docstring.

  6. +
+
+ +### Lists + + +
+

Just as a for loop is a way to do operations many times, a list is a way to store many values. Unlike NumPy arrays, they are built into the language. We create a list by putting values inside square brackets:

+
+ + +
+
odds = [1, 3, 5, 7]
+print 'odds are:', odds
+
+ + +
+

We select individual elements from lists by indexing them:

+
+ + +
+
print 'first and last:', odds[0], odds[-1]
+
+ + +
+

and if we loop over a list, the loop variable is assigned elements one at a time:

+
+ + +
+
for number in odds:
+    print number
+
+ + +
+

There is one important difference between lists and strings: we can change the values in a list, but we cannot change the characters in a string. For example:

+
+ + +
+
names = ['Newton', 'Darwing', 'Turing'] # typo in Darwin's name
+print 'names is originally:', names
+names[1] = 'Darwin' # correct the name
+print 'final value of names:', names
+
+ + +
+

works, but:

+
+ + +
+
name = 'Bell'
+name[0] = 'b'
+
+ + +
+

does not.

+
+

Ch-Ch-Ch-Changes

+

Data that can be changed is called mutable, while data that cannot be is called immutable. Like strings, numbers are immutable: there's no way to make the number 0 have the value 1 or vice versa (at least, not in Python—there actually are languages that will let people do this, with predictably confusing results). Lists and arrays, on the other hand, are mutable: both can be modified after they have been created.

+

Programs that modify data in place can be harder to understand than ones that don't because readers may have to mentally sum up many lines of code in order to figure out what the value of something actually is. On the other hand, programs that modify data in place instead of creating copies that are almost identical to the original every time they want to make a small change are much more efficient.

+
+

There are many ways to change the contents of lists besides assigning to elements:

+
+ + +
+
odds.append(11)
+print 'odds after adding a value:', odds
+
+ + +
+
del odds[0]
+print 'odds after removing the first element:', odds
+
+ + +
+
odds.reverse()
+print 'odds after reversing:', odds
+
+ + +
+

Challenges

+
    +
  1. Write a function called total that calculates the sum of the values in a list. (Python has a built-in function called sum that does this for you. Please don't use it for this exercise.)
  2. +
+
+ +### Processing Multiple Files + + +
+

We now have almost everything we need to process all our data files. The only thing that's missing is a library with a rather unpleasant name:

+
+ + +
+
import glob
+
+ + +
+

The glob library contains a single function, also called glob, that finds files whose names match a pattern. We provide those patterns as strings: the character * matches zero or more characters, while ? matches any one character. We can use this to get the names of all the IPython Notebooks we have created so far:

+
+ + +
+
print glob.glob('*.ipynb')
+
+ +
+
['01-numpy.ipynb', '02-func.ipynb', '03-loop.ipynb', '04-cond.ipynb', '05-defensive.ipynb', '06-cmdline.ipynb', 'spatial-intro.ipynb']
+
+
+ + +
+

or to get the names of all our CSV data files:

+
+ + +
+
print glob.glob('*.csv')
+
+ +
+
['inflammation-01.csv', 'inflammation-02.csv', 'inflammation-03.csv', 'inflammation-04.csv', 'inflammation-05.csv', 'inflammation-06.csv', 'inflammation-07.csv', 'inflammation-08.csv', 'inflammation-09.csv', 'inflammation-10.csv', 'inflammation-11.csv', 'inflammation-12.csv', 'small-01.csv', 'small-02.csv', 'small-03.csv', 'swc_bc_coords.csv']
+
+
+ + +
+

As these examples show, glob.glob's result is a list of strings, which means we can loop over it to do something with each filename in turn. In our case, the "something" we want is our analyze function. Let's test it by analyzing the first three files in the list:

+
+ + +
+
filenames = glob.glob('*.csv')
+filenames = filenames[0:3]
+for f in filenames:
+    print f
+    analyze(f)
+
+ +
+
inflammation-01.csv
+inflammation-02.csv
+inflammation-03.csv
+
+
+ + +
+

Sure enough, the maxima of these data sets show exactly the same ramp as the first, and their minima show the same staircase structure.

+
+ + +
+

Challenges

+
    +
  1. Write a function called analyze_all that takes a filename pattern as its sole argument and runs analyze for each file whose name matches the pattern.
  2. +
+
+ + +
+

Key Points

+
    +
  • Use for variable in collection to process the elements of a collection one at a time.
  • +
  • The body of a for loop must be indented.
  • +
  • Use len(thing) to determine the length of something that contains other values.
  • +
  • [value1, value2, value3, ...] creates a list.
  • +
  • Lists are indexed and sliced in the same way as strings and arrays.
  • +
  • Lists are mutable (i.e., their values can be changed in place).
  • +
  • Strings are immutable (i.e., the characters in them cannot be changed).
  • +
  • Use glob.glob(pattern) to create a list of files whose names match a pattern.
  • +
  • Use * in a pattern to match zero or more characters, and ? to match any single character.
  • +
+
+ + +
+

Next Steps

+

We have now solved our original problem: we can analyze any number of data files with a single command. More importantly, we have met two of the most important ideas in programming:

+
    +
  1. Use functions to make code easier to re-use and easier to understand.
  2. +
  3. Use lists and arrays to store related values, and loops to repeat operations on them.
  4. +
+

We have one more big idea to introduce, and then we will be able to go back and create a heat map like the one we initially used to display our first data set.

+
diff --git a/cached/novice/python/04-cond.md b/cached/novice/python/04-cond.md new file mode 100644 index 000000000..d104091c7 --- /dev/null +++ b/cached/novice/python/04-cond.md @@ -0,0 +1,698 @@ +--- +layout: lesson +root: ../.. +--- + +## Making Choices + + +
+

Our previous lessons have shown us how to manipulate data, define our own functions, and repeat things. However, the programs we have written so far always do the same things, regardless of what data they're given. We want programs to make choices based on the values they are manipulating. To help us see what decisions they're making, we'll start by looking at how computers manipulate images.

+
+ + +
+

Objectives

+
    +
  • Create a simple "image" made out of colored blocks.
  • +
  • Explain how the RGB model represents colors.
  • +
  • Explain the similarities and differences between tuples and lists.
  • +
  • Write conditional statements including if, elif, and else branches.
  • +
  • Correctly evaluate expressions containing and and or.
  • +
  • Correctly write and interpret code containing nested loops and conditionals.
  • +
  • Explain the advantages of putting frequently-modified code in a function.
  • +
+
+ +### Image Grids + + +
+

Let's start by creating some simple heat maps of our own using a library called ipythonblocks. The first step is to create our own "image":

+
+ + +
+
from ipythonblocks import ImageGrid
+
+ + +
+

Unlike the import statements we have seen earlier, this one doesn't load the entire ipythonblocks library. Instead, it just loads ImageGrid from that library, since that's the only thing we need (for now).

+

Once we have ImageGrid loaded, we can use it to create a very simple grid of colored cells:

+
+ + +
+
grid = ImageGrid(5, 3)
+grid.show()
+
+ +
+
+
+ + +
+

Just like a NumPy array, an ImageGrid has some properties that hold information about it:

+
+ + +
+
print 'grid width:', grid.width
+print 'grid height:', grid.height
+print 'grid lines on:', grid.lines_on
+
+ +
+
grid width: 5
+grid height: 3
+grid lines on: True
+
+
+ + +
+

The obvious thing to do with a grid like this is color in its cells, but in order to do that, we need to know how computers represent color. The most common scheme is RGB, which is short for "red, green, blue". RGB is an additive color model: every shade is some combination of red, green, and blue intensities. We can think of these three values as being the axes in a cube:

+

RGB Color Cube

+

An RGB color is an example of a multi-part value: like a Cartesian coordinate, it is one thing with several parts. We can represent such a value in Python using a tuple, which we write using parentheses instead of the square brackets used for a list:

+
+ + +
+
position = (12.3, 45.6)
+print 'position is:', position
+color = (10, 20, 30)
+print 'color is:', color
+
+ +
+
position is: (12.3, 45.6)
+color is: (10, 20, 30)
+
+
+ + +
+

We can select elements from tuples using indexing, just as we do with lists and arrays:

+
+ + +
+
print 'first element of color is:', color[0]
+
+ +
+
first element of color is: 10
+
+
+ + +
+

Unlike lists and arrays, though, tuples cannot be changed after they are created—in technical terms, they are immutable:

+
+ + +
+
color[0] = 40
+print 'first element of color after change:', color[0]
+
+ +
+
---------------------------------------------------------------------------
+TypeError                                 Traceback (most recent call last)
+<ipython-input-11-9c3dd30a4e52> in <module>()
+----> 1 color[0] = 40
+      2 print 'first element of color after change:', color[0]
+
+TypeError: 'tuple' object does not support item assignment
+
+ + +
+

If a tuple represents an RGB color, its red, green, and blue components can take on values between 0 and 255. The upper bound may seem odd, but it's the largest number that can be represented in an 8-bit byte (i.e., 2^8 − 1). This makes it easy for computers to manipulate colors, while providing fine enough gradations to fool most human eyes, most of the time.

+

Let's see what a few RGB colors actually look like:

+
+ + +
+
row = ImageGrid(8, 1)
+row[0, 0] = (0, 0, 0)   # no color => black
+row[1, 0] = (255, 255, 255) # all colors => white
+row[2, 0] = (255, 0, 0) # all red
+row[3, 0] = (0, 255, 0) # all green
+row[4, 0] = (0, 0, 255) # all blue
+row[5, 0] = (255, 255, 0) # red and green
+row[6, 0] = (255, 0, 255) # red and blue
+row[7, 0] = (0, 255, 255) # green and blue
+row.show()
+
+ +
+
+
+ + +
+

Simple color values like (0,255,0) are easy enough to decipher with a bit of practice, but what color is (214,90,127)? To help us, ipythonblocks provides a function called show_color:

+
+ + +
+
from ipythonblocks import show_color
+show_color(214, 90, 127)
+
+ +
+
+
+ + +
+

It also provides a table of standard colors:

+
+ + +
+
from ipythonblocks import colors
+c = ImageGrid(3, 2)
+c[0, 0] = colors['Fuchsia']
+c[0, 1] = colors['Salmon']
+c[1, 0] = colors['Orchid']
+c[1, 1] = colors['Lavender']
+c[2, 0] = colors['LimeGreen']
+c[2, 1] = colors['HotPink']
+c.show()
+
+ +
+
+
+ + +
+

Challenges

+
    +
  1. Fill in the ____ in the code below to create a bar that changes color from dark blue to black.

    +
    bar = ImageGrid(10, 1)
    +for x in range(10):
    +    bar[x, 0] = (0, 0, ____)
    +bar.show()
  2. +
  3. Why do computers use red, green, and blue as their primary colors?

  4. +
+
+ +### Conditionals + + +
+

The other thing we need in order to create a heat map of our own is a way to pick a color based on a data value. The tool Python gives us for doing this is called a conditional statement, and looks like this:

+
+ + +
+
num = 37
+if num > 100:
+    print 'greater'
+else:
+    print 'not greater'
+print 'done'
+
+ +
+
not greater
+done
+
+
+ + +
+

The second line of this code uses the keyword if to tell Python that we want to make a choice. If the test that follows it is true, the body of the if (i.e., the lines indented underneath it) are executed. If the test is false, the body of the else is executed instead. Only one or the other is ever executed:

+
+ + +
+

Executing a Conditional

+
+ + +
+

Conditional statements don't have to include an else. If there isn't one, Python simply does nothing if the test is false:

+
+ + +
+
num = 53
+print 'before conditional...'
+if num > 100:
+    print '53 is greater than 100'
+print '...after conditional'
+
+ +
+
before conditional...
+...after conditional
+
+
+ + +
+

We can also chain several tests together using elif, which is short for "else if". This makes it simple to write a function that returns the sign of a number:

+
+ + +
+
def sign(num):
+    if num > 0:
+        return 1
+    elif num == 0:
+        return 0
+    else:
+        return -1
+
+print 'sign of -3:', sign(-3)
+
+ +
+
sign of -3: -1
+
+
+ + +
+

One important thing to notice in the code above is that we use a double equals sign == to test for equality rather than a single equals sign because the latter is used to mean assignment. This convention was inherited from C, and while many other programming languages work the same way, it does take a bit of getting used to...

+

We can also combine tests using and and or. and is only true if both parts are true:

+
+ + +
+
if (1 > 0) and (-1 > 0):
+    print 'both parts are true'
+else:
+    print 'one part is not true'
+
+ +
+
one part is not true
+
+
+ + +
+

while or is true if either part is true:

+
+ + +
+
if (1 < 0) or ('left' < 'right'):
+    print 'at least one test is true'
+
+ +
+
at least one test is true
+
+
+ + +
+

In this case, "either" means "either or both", not "either one or the other but not both".

+
+ + +
+

Challenges

+
    +
  1. True and False aren't the only values in Python that are true and false. In fact, any value can be used in an if or elif. After reading and running the code below, explain what the rule is for which values are considered true and which are considered false. (Note that if the body of a conditional is a single statement, we can write it on the same line as the if.)

    +
    if '': print 'empty string is true'
    +if 'word': print 'word is true'
    +if []: print 'empty list is true'
    +if [1, 2, 3]: print 'non-empty list is true'
    +if 0: print 'zero is true'
    +if 1: print 'one is true'
  2. +
  3. Write a function called near that returns True if its first parameter is within 10% of its second and False otherwise. Compare your implementation with your partner's: do you return the same answer for all possible pairs of numbers?

  4. +
+
+ +### Nesting + + +
+

Another thing to realize is that if statements can be combined with loops just as easily as they can be combined with functions. For example, if we want to sum the positive numbers in a list, we can write this:

+
+ + +
+
numbers = [-5, 3, 2, -1, 9, 6]
+total = 0
+for n in numbers:
+    if n >= 0:
+        total = total + n
+print 'sum of positive values:', total
+
+ +
+
sum of positive values: 20
+
+
+ + +
+

We could equally well calculate the positive and negative sums in a single loop:

+
+ + +
+
pos_total = 0
+neg_total = 0
+for n in numbers:
+    if n >= 0:
+        pos_total = pos_total + n
+    else:
+        neg_total = neg_total + n
+print 'negative and positive sums are:', neg_total, pos_total
+
+ +
+
negative and positive sums are: -6 20
+
+
+ + +
+

We can even put one loop inside another:

+
+ + +
+
for consonant in 'bcd':
+    for vowel in 'ae':
+        print consonant + vowel
+
+ +
+
ba
+be
+ca
+ce
+da
+de
+
+
+ + +
+

As the diagram below shows, the inner loop runs from start to finish each time the outer loop runs once:

+
+ + +
+

Execution of Nested Loops

+
+ + +
+

We can combine nesting and conditionals to create patterns in an image:

+
+ + +
+
square = ImageGrid(5, 5)
+for x in range(square.width):
+    for y in range(square.height):
+        if x < y:
+            square[x, y] = colors['Fuchsia']
+        elif x == y:
+            square[x, y] = colors['Olive']
+        else:
+            square[x, y] = colors['SlateGray']
+square.show()
+
+ +
+
+
+ + +
+

This is our first hand-made data visualization: the colors show where x is less than, equal to, or greater than y.

+
+ + +
+

Challenges

+
    +
  1. Will changing the nesting of the loops in the code above—i.e., wrapping the Y-axis loop around the X-axis loop—change the final image? Why or why not?

  2. +
  3. Python (and most other languages in the C family) provides in-place operators that work like this:

    +
    x = 1  # original value
    +x += 1 # add one to x, assigning result back to x
    +x *= 3 # multiply x by 3
    +print x
    +6
    +

    Rewrite the code that sums the positive and negative numbers in a list using in-place operators. Do you think the result is more or less readable than the original?

  4. +
+
+ +### Creating a Heat Map + + +
+

The last step is to turn our data into something we can see. As in previous lessons, the first step is to get the data into memory:

+
+ + +
+
import numpy as np
+data = np.loadtxt(fname='inflammation-01.csv', delimiter=',')
+print 'data shape:', data.shape
+
+ +
+
data shape: (60, 40)
+
+
+ + +
+

The second is to create an image grid that is the same size as the data:

+
+ + +
+
width, height = data.shape
+heatmap = ImageGrid(width, height)
+
+ + +
+

(The first line of the code above takes advantage of a neat trick: we can unpack the values in a tuple by assigning it to as many variables as it has entries.)

+

The third step is to decide how we are going to color the cells in the heat map. To keep things simple, we will use red, green, and blue as our colors, and compare data values to the data set's mean. Here's the code:

+
+ + +
+
for x in range(width):
+    for y in range(height):
+        if data[x, y] < data.mean():
+            heatmap[x, y] = colors['Red']
+        elif data[x, y] == data.mean():
+            heatmap[x, y] = colors['Green']
+        else:
+            heatmap[x, y] = colors['Blue']
+heatmap.show()
+
+ +
+
+
+ + +
+

This may be what we asked for, but both the image and the code are hideous:

+
    +
  1. It's too large for us to view the whole thing at once on a small laptop screen.
  2. +
  3. Our first heatmap had time along the X axis; this seems to have time along the Y axis.
  4. +
  5. Red against blue is pretty hard on the eyes.
  6. +
  7. The heatmap only shows two colors because none of the (integer) measurements has exactly the same value as the (fractional) mean.
  8. +
  9. We are calculating the mean of data either once or twice each time we go through the loop. That means that on a 40×60 data set, we are performing the same calculation 2400 times.
  10. +
+

Here's how we can improve it:

+
    +
  1. We can give ImageGrid an optional parameter block_size to set the size of each block.
  2. +
  3. We can transpose our data before creating the grid.
  4. +
  5. We can pick better colors (I'm personally fond of orchid, fuchsia, and hot pink).
  6. +
  7. Instead of checking if values are exactly equal to the mean, we can see if they are close to it.
  8. +
  9. We can calculate the mean once, before we start our loops, and use that value over and over.
  10. +
+

Our modified code looks like this:

+
+ + +
+
flipped = data.transpose()
+width, height = flipped.shape
+heatmap = ImageGrid(width, height, block_size=5)
+center = flipped.mean()
+for x in range(width):
+    for y in range(height):
+        if flipped[x, y] < (0.8 * center):
+            heatmap[x, y] = colors['Orchid']
+        elif flipped[x, y] > (1.2 * center):
+            heatmap[x, y] = colors['HotPink']
+        else:
+            heatmap[x, y] = colors['Fuchsia']
+heatmap.show()
+
+ +
+
+
+ + +
+

That's a bit better—but now the contrast between the colors isn't great enough. And there still aren't very many fuchsia cells: we may want to widen the band around the mean that gets that color.

+

We could rewrite our loop a third time, but the right thing to do is to put our code in a function so that we can experiment with bands and colors more easily.

+
+ + +
+
def make_heatmap(values, low_color, mid_color, high_color, low_band, high_band, block_size):
+    '''Make a 3-colored heatmap from a 2D array of data.'''
+    width, height = values.shape
+    result = ImageGrid(width, height, block_size=block_size)
+    center = values.mean()
+    for x in range(width):
+        for y in range(height):
+            if values[x, y] < low_band * center:
+                result[x, y] = low_color
+            elif values[x, y] > high_band * center:
+                result[x, y] = high_color
+            else:
+                result[x, y] = mid_color
+    return result
+
+ + +
+

To test this function, we'll run it with the settings we just used:

+
+ + +
+
h = make_heatmap(flipped, colors['Orchid'], colors['Fuchsia'], colors['HotPink'], 0.8, 1.2, 5)
+h.show()
+
+ +
+
+
+ + +
+

That seems right, so let's widen the band and use more dramatic colors:

+
+ + +
+
h = make_heatmap(flipped, colors['Gray'], colors['YellowGreen'], colors['SpringGreen'], 0.5, 1.5, 5)
+h.show()
+
+ +
+
+
+ + +
+

We'll probably want to experiment a bit more before publishing, but writing a function has made experimenting easy. We can make it even easier by re-defining our function one more time to give the parameters default values. While we're at it, let's put the low and high bands at the front, since they're more likely to change than our color choices:

+
+ + +
+
def make_heatmap(values,
+                 low_band=0.5, high_band=1.5,
+                 low_color=colors['Gray'], mid_color=colors['YellowGreen'], high_color=colors['SpringGreen'],
+                 block_size=5):
+    '''Make a 3-colored heatmap from a 2D array of data.
+    Default color scheme is gray to green.'''
+    width, height = values.shape
+    result = ImageGrid(width, height, block_size=block_size)
+    center = values.mean()
+    for x in range(width):
+        for y in range(height):
+            if values[x, y] < low_band * center:
+                result[x, y] = low_color
+            elif values[x, y] > high_band * center:
+                result[x, y] = high_color
+            else:
+                result[x, y] = mid_color
+    return result
+
+ + +
+

Once default values are added, the function's first line is too long to fit comfortably on our screen. Rather than breaking it wherever it hits the right edge of the screen, we have divided the parameters into logical groups to make it more readable.

+

Again, our first test is to re-run it with the same values as before (which we give it in a different order, since we've changed the order of parameters):

+
+ + +
+
h = make_heatmap(flipped, 0.5, 1.5, colors['Gray'], colors['YellowGreen'], colors['SpringGreen'], 5)
+h.show()
+
+ +
+
+
+ + +
+

We can now leave out everything except the data being visualized, or provide the data and the bands and re-use the default colors and block size:

+
+ + +
+
h = make_heatmap(flipped, 0.4, 1.6)
+h.show()
+
+ +
+
+
+ + +
+

We can now explore our data with just a few keystrokes, which means we can concentrate on our science and not on our programming.

+
+ + +
+

Challenges

+
    +
  1. Why did we transpose our data outside our heat map function? Why not have the function perform the transpose?

  2. +
  3. Why does the heat map function return the grid rather than displaying it immediately? Do you think this is a good or bad design choice?

  4. +
  5. Explain what the overall effect of this code is: ~ temp = left; left = right; right = temp ~ Compare it to: ~ left, right = right, left ~ Do they always do the same thing? Which do you find easier to read?

  6. +
+
+ + +
+

Key Points

+
    +
  • Use the ImageGrid class from the ipythonblocks library to create simple "images" made of colored blocks.
  • +
  • Specify colors using (red, green, blue) triples, each component of which is an integer in the range 0..255.
  • +
  • Use if condition to start a conditional statement, elif condition to provide additional tests, and else to provide a default.
  • +
  • The bodies of the branches of conditional statements must be indented.
  • +
  • Use == to test for equality.
  • +
  • X and Y is only true if both X and Y are true.
  • +
  • X or Y is true if either X or Y, or both, are true.
  • +
  • Zero, the empty string, and the empty list are considered false; all other numbers, strings, and lists are considered true.
  • +
  • Nest loops to operate on multi-dimensional data.
  • +
  • Put code whose parameters change frequently in a function, then call it with different parameter values to customize its behavior.
  • +
+
+ + +
+

Next Steps

+

Our final heatmap function is 17 lines long, which means that if there's a 95% chance of each line being correct, the odds of the whole function being right are only 41%. Before we go any further, we need to learn how to test whether our code is doing what we want it to do, and that will be the subject of the next lesson.

+
diff --git a/cached/novice/python/05-defensive.md b/cached/novice/python/05-defensive.md new file mode 100644 index 000000000..e22240111 --- /dev/null +++ b/cached/novice/python/05-defensive.md @@ -0,0 +1,420 @@ +--- +layout: lesson +root: ../.. +--- + +## Defensive Programming + + +
+

Our previous lessons have introduced the basic tools of programming: variables and lists, file I/O, loops, conditionals, and functions. What they haven't done is show us how to tell whether a program is getting the right answer, and how to tell if it's still getting the right answer as we make changes to it.

+

To achieve that, we need to:

+
    +
  • write programs that check their own operation,
  • +
  • write and run tests for widely-used functions, and
  • +
  • make sure we know what "correct" actually means.
  • +
+

The good news is, doing these things will speed up our programming, not slow it down. As in real carpentry—the kind done with lumber—the time saved by measuring carefully before cutting a piece of wood is much greater than the time that measuring takes.

+
+ + +
+

Objectives

+
    +
  • Explain what an assertion is.
  • +
  • Add assertions to programs that correctly check the program's state.
  • +
  • Correctly add precondition and postcondition assertions to functions.
  • +
  • Explain what test-driven development is, and use it when creating new functions.
  • +
  • Explain why variables should be initialized using actual data values rather than arbitrary constants.
  • +
  • Debug code containing an error systematically.
  • +
+
+ +### Assertions + + +
+

The first step toward getting the right answers from our programs is to assume that mistakes will happen and to guard against them. This is called defensive programming, and the most common way to do it is to add assertions to our code so that it checks itself as it runs. An assertion is simply a statement that something must be true at a certain point in a program. When Python sees one, it checks the assertion's condition. If it's true, Python does nothing, but if it's false, Python halts the program immediately and prints the error message provided. For example, this piece of code halts as soon as the loop encounters a value that isn't positive:

+
+ + +
+
numbers = [1.5, 2.3, 0.7, -0.001, 4.4]
+total = 0.0
+for n in numbers:
+    assert n >= 0.0, 'Data should only contain positive values'
+    total += n
+print 'total is:', total
+
+ +
+
---------------------------------------------------------------------------
+AssertionError                            Traceback (most recent call last)
+<ipython-input-3-33d87ea29ae4> in <module>()
+      2 total = 0.0
+      3 for n in numbers:
+----> 4     assert n >= 0.0, 'Data should only contain positive values'
+      5     total += n
+      6 print 'total is:', total
+
+AssertionError: Data should only contain positive values
+
+ + +
+

Programs like the Firefox browser are full of assertions: 10-20% of the code they contain is there to check that the other 80-90% is working correctly. Broadly speaking, assertions fall into three categories:

+
    +
  • A precondition is something that must be true at the start of a function in order for it to work correctly.
  • +
  • A postcondition is something that the function guarantees is true when it finishes.
  • +
  • An invariant is something that is always true at a particular point inside a piece of code.
  • +
+

For example, suppose we are representing rectangles using a tuple of four coordinates (x0, y0, x1, y1). In order to do some calculations, we need to normalize the rectangle so that it is at the origin and 1.0 units long on its longest axis. This function does that, but checks that its input is correctly formatted and that its result makes sense:

+
+ + +
+
def normalize_rectangle(rect):
+    '''Normalizes a rectangle so that it is at the origin and 1.0 units long on its longest axis.'''
+    assert len(rect) == 4, 'Rectangles must contain 4 coordinates'
+    x0, y0, x1, y1 = rect
+    assert x0 < x1, 'Invalid X coordinates'
+    assert y0 < y1, 'Invalid Y coordinates'
+
+    dx = x1 - x0
+    dy = y1 - y0
+    if dx > dy:
+        scaled = float(dx) / dy
+        upper_x, upper_y = 1.0, scaled
+    else:
+        scaled = float(dx) / dy
+        upper_x, upper_y = scaled, 1.0
+
+    assert 0 < upper_x <= 1.0, 'Calculated upper X coordinate invalid'
+    assert 0 < upper_y <= 1.0, 'Calculated upper Y coordinate invalid'
+
+    return (0, 0, upper_x, upper_y)
+
+ + +
+

The preconditions on lines 2, 4, and 5 catch invalid inputs:

+
+ + +
+
print normalize_rectangle( (0.0, 1.0, 2.0) ) # missing the fourth coordinate
+
+ +
+
---------------------------------------------------------------------------
+AssertionError                            Traceback (most recent call last)
+<ipython-input-5-3a97b1dcab70> in <module>()
+----> 1 print normalize_rectangle( (0.0, 1.0, 2.0) ) # missing the fourth coordinate
+
+<ipython-input-4-9f8adbfdcfc9> in normalize_rectangle(rect)
+      1 def normalize_rectangle(rect):
+      2     '''Normalizes a rectangle so that it is at the origin and 1.0 units long on its longest axis.'''
+----> 3     assert len(rect) == 4, 'Rectangles must contain 4 coordinates'
+      4     x0, y0, x1, y1 = rect
+      5     assert x0 < x1, 'Invalid X coordinates'
+
+AssertionError: Rectangles must contain 4 coordinates
+
+ + +
+
print normalize_rectangle( (4.0, 2.0, 1.0, 5.0) ) # X axis inverted
+
+ +
+
---------------------------------------------------------------------------
+AssertionError                            Traceback (most recent call last)
+<ipython-input-6-f05ae7878a45> in <module>()
+----> 1 print normalize_rectangle( (4.0, 2.0, 1.0, 5.0) ) # X axis inverted
+
+<ipython-input-4-9f8adbfdcfc9> in normalize_rectangle(rect)
+      3     assert len(rect) == 4, 'Rectangles must contain 4 coordinates'
+      4     x0, y0, x1, y1 = rect
+----> 5     assert x0 < x1, 'Invalid X coordinates'
+      6     assert y0 < y1, 'Invalid Y coordinates'
+      7 
+
+AssertionError: Invalid X coordinates
+
+ + +
+

The post-conditions help us catch bugs by telling us when our calculations cannot have been correct. For example, if we normalize a rectangle that is taller than it is wide everything seems OK:

+
+ + +
+
print normalize_rectangle( (0.0, 0.0, 1.0, 5.0) )
+
+ +
+
(0, 0, 0.2, 1.0)
+
+
+ + +
+

but if we normalize one that's wider than it is tall, the assertion is triggered:

+
+ + +
+
print normalize_rectangle( (0.0, 0.0, 5.0, 1.0) )
+
+ +
+
---------------------------------------------------------------------------
+AssertionError                            Traceback (most recent call last)
+<ipython-input-8-5f0ef7954aeb> in <module>()
+----> 1 print normalize_rectangle( (0.0, 0.0, 5.0, 1.0) )
+
+<ipython-input-4-9f8adbfdcfc9> in normalize_rectangle(rect)
+     16 
+     17     assert 0 < upper_x <= 1.0, 'Calculated upper X coordinate invalid'
+---> 18     assert 0 < upper_y <= 1.0, 'Calculated upper Y coordinate invalid'
+     19 
+     20     return (0, 0, upper_x, upper_y)
+
+AssertionError: Calculated upper Y coordinate invalid
+
+ + +
+

Re-reading our function, we realize that line 10 should divide dy by dx rather than dx by dy. (You can display line numbers by typing Ctrl-M, then L.) If we had left out the assertion at the end of the function, we would have created and returned something that had the right shape as a valid answer, but wasn't. Detecting and debugging that would almost certainly have taken more time in the long run than writing the assertion.

+

But assertions aren't just about catching errors: they also help people understand programs. Each assertion gives the person reading the program a chance to check (consciously or otherwise) that their understanding matches what the code is doing.

+

Most good programmers follow two rules when adding assertions to their code. The first is, "fail early, fail often". The greater the distance between when and where an error occurs and when it's noticed, the harder the error will be to debug, so good code catches mistakes as early as possible.

+

The second rule is, "turn bugs into assertions or tests". If you made a mistake in a piece of code, the odds are good that you have made other mistakes nearby, or will make the same mistake (or a related one) the next time you change it. Writing assertions to check that you haven't regressed (i.e., haven't re-introduced an old problem) can save a lot of time in the long run, and helps to warn people who are reading the code (including your future self) that this bit is tricky.

+
+ + +
+

Challenges

+
    +
  1. Suppose you are writing a function called average that calculates the average of the numbers in a list. What pre-conditions and post-conditions would you write for it? Compare your answer to your neighbor's: can you think of a function that will pass your tests but not hers or vice versa?

  2. +
  3. Explain in words what the assertions in this code check, and for each one, give an example of input that will make that assertion fail.

    +
    def running(values):
    +    assert len(values) > 0
    +    result = [values[0]]
    +    for v in values[1:]:
    +        assert result[-1] >= 0
    +        result.append(result[-1] + v)
    +    assert result[-1] >= result[0]
    +    return result
  4. +
+
+ +### Test-Driven Development + + +
+

An assertion checks that something is true at a particular point in the program. The next step is to check the overall behavior of a piece of code, i.e., to make sure that it produces the right output when it's given a particular input. For example, suppose we need to find where two or more time series overlap. The range of each time series is represented as a pair of numbers, which are the time the interval started and ended. The output is the largest range that they all include:

+
+ + +
+

Overlapping Ranges

+
+ + +
+

Most novice programmers would solve this problem like this:

+
    +
  1. Write a function range_overlap.
  2. +
  3. Call it interactively on two or three different inputs.
  4. +
  5. If it produces the wrong answer, fix the function and re-run that test.
  6. +
+

This clearly works—after all, thousands of scientists are doing it right now—but there's a better way:

+
    +
  1. Write a short function for each test.
  2. +
  3. Write a range_overlap function that should pass those tests.
  4. +
  5. If range_overlap produces any wrong answers, fix it and re-run the test functions.
  6. +
+

Writing the tests before writing the function they exercise is called test-driven development (TDD). Its advocates believe it produces better code faster because:

+
    +
  1. If people write tests after writing the thing to be tested, they are subject to confirmation bias, i.e., they subconsciously write tests to show that their code is correct, rather than to find errors.
  2. +
  3. Writing tests helps programmers figure out what the function is actually supposed to do.
  4. +
+

Here are three test functions for range_overlap:

+
+ + +
+
assert range_overlap([ (0.0, 1.0) ]) == (0.0, 1.0)
+assert range_overlap([ (0.0, 1.0), (0.0, 2.0) ]) == (0.0, 1.0)
+assert range_overlap([ (0.0, 1.0), (0.0, 2.0), (-1.0, 1.0) ]) == (0.0, 1.0)
+
+ + +
+

The error is actually reassuring: we haven't written range_overlap yet, so if the tests passed, it would be a sign that someone else had and that we were accidentally using their function.

+

And as a bonus of writing these tests, we've implicitly defined what our input and output look like: we expect a list of pairs as input, and produce a single pair as output.

+

Something important is missing, though. We don't have any tests for the case where the ranges don't overlap at all:

+
assert range_overlap([ (0.0, 1.0), (5.0, 6.0) ]) == ???
+

What should range_overlap do in this case: fail with an error message, produce a special value like (0.0, 0.0) to signal that there's no overlap, or something else? Any actual implementation of the function will do one of these things; writing the tests first helps us figure out which is best before we're emotionally invested in whatever we happened to write before we realized there was an issue.

+

And what about this case?

+
assert range_overlap([ (0.0, 1.0), (1.0, 2.0) ]) == ???
+

Do two segments that touch at their endpoints overlap or not? Mathematicians usually say "yes", but engineers usually say "no". The best answer is "whatever is most useful in the rest of our program", but again, any actual implementation of range_overlap is going to do something, and whatever it is ought to be consistent with what it does when there's no overlap at all.

+

Since we're planning to use the range this function returns as the X axis in a time series chart, we decide that:

+
    +
  1. every overlap has to have non-zero width, and
  2. +
  3. we will return the special value None when there's no overlap.
  4. +
+

None is built into Python, and means "nothing here". (Other languages often call the equivalent value null or nil). With that decision made, we can finish writing our last two tests:

+
+ + +
+
assert range_overlap([ (0.0, 1.0), (5.0, 6.0) ]) == None
+assert range_overlap([ (0.0, 1.0), (1.0, 2.0) ]) == None
+
+ +
+
---------------------------------------------------------------------------
+AssertionError                            Traceback (most recent call last)
+<ipython-input-10-d877ef460ba2> in <module>()
+----> 1 assert range_overlap([ (0.0, 1.0), (5.0, 6.0) ]) == None
+      2 assert range_overlap([ (0.0, 1.0), (1.0, 2.0) ]) == None
+
+AssertionError: 
+
+ + +
+

Again, we get an error because we haven't written our function, but we're now ready to do so:

+
+ + +
+
def range_overlap(ranges):
+    '''Return common overlap among a set of [low, high] ranges.'''
+    lowest = 0.0
+    highest = 1.0
+    for (low, high) in ranges:
+        lowest = max(lowest, low)
+        highest = min(highest, high)
+    return (lowest, highest)
+
+ + +
+

(Take a moment to think about why we use max to raise lowest and min to lower highest.) We'd now like to re-run our tests, but they're scattered across three different cells. To make running them easier, let's put them all in a function:

+
+ + +
+
def test_range_overlap():
+    assert range_overlap([ (0.0, 1.0) ]) == (0.0, 1.0)
+    assert range_overlap([ (0.0, 1.0), (0.0, 2.0) ]) == (0.0, 1.0)
+    assert range_overlap([ (0.0, 1.0), (0.0, 2.0), (-1.0, 1.0) ]) == (0.0, 1.0)
+    assert range_overlap([ (0.0, 1.0), (5.0, 6.0) ]) == None
+    assert range_overlap([ (0.0, 1.0), (1.0, 2.0) ]) == None
+
+ + +
+

We can now test range_overlap with a single function call:

+
+ + +
+
test_range_overlap()
+
+ +
+
---------------------------------------------------------------------------
+AssertionError                            Traceback (most recent call last)
+<ipython-input-13-cf9215c96457> in <module>()
+----> 1 test_range_overlap()
+
+<ipython-input-12-34c3659163fc> in test_range_overlap()
+      3     assert range_overlap([ (0.0, 1.0), (0.0, 2.0) ]) == (0.0, 1.0)
+      4     assert range_overlap([ (0.0, 1.0), (0.0, 2.0), (-1.0, 1.0) ]) == (0.0, 1.0)
+----> 5     assert range_overlap([ (0.0, 1.0), (5.0, 6.0) ]) == None
+      6     assert range_overlap([ (0.0, 1.0), (1.0, 2.0) ]) == None
+
+AssertionError: 
+
+ + +
+

The first of the tests that was supposed to produce None fails, so we know there's something wrong with our function. What we don't know, though, is whether the last of our five tests passed or failed, because Python halted the program as soon as it spotted the first error. Still, some information is better than none, and if we trace the behavior of the function with that input, we realize that we're initializing lowest and highest to 0.0 and 1.0 respectively, regardless of the input values. This violates another important rule of programming: "always initialize from data". We'll leave it as an exercise to fix range_overlap.

+
+ + +
+

Challenges

+
    +
  1. Fix range_overlap. Re-run test_range_overlap after each change you make.
  2. +
+
+ +### Debugging + + +
+

Once testing has uncovered problems, the next step is to fix them. Many novices do this by making more-or-less random changes to their code until it seems to produce the right answer, but that's very inefficient (and the result is usually only correct for the one case they're testing). The more experienced a programmer is, the more systematically they debug, and most follow some variation on the rules explained below.

+

Know What It's Supposed to Do

+

The first step in debugging something is to know what it's supposed to do. "My program doesn't work" isn't good enough: in order to diagnose and fix problems, we need to be able to tell correct output from incorrect. If we can write a test case for the failing case—i.e., if we can assert that with these inputs, the function should produce that result— then we're ready to start debugging. If we can't, then we need to figure out how we're going to know when we've fixed things.

+

But writing test cases for scientific software is frequently harder than writing test cases for commercial applications, because if we knew what the output of the scientific code was supposed to be, we wouldn't be running the software: we'd be writing up our results and moving on to the next program. In practice, scientists tend to do the following:

+
    +
  1. Test with simplified data. Before doing statistics on a real data set, we should try calculating statistics for a single record, for two identical records, for two records whose values are one step apart, or for some other case where we can calculate the right answer by hand.

  2. +
  3. Test a simplified case. If our program is supposed to simulate magnetic eddies in rapidly-rotating blobs of supercooled helium, our first test should be a blob of helium that isn't rotating, and isn't being subjected to any external electromagnetic fields. Similarly, if we're looking at the effects of climate change on speciation, our first test should hold temperature, precipitation, and other factors constant.

  4. +
  5. Compare to an oracle. A test oracle is something—experimental data, an older program whose results are trusted, or even a human expert—against which we can compare the results of our new program. If we have a test oracle, we should store its output for particular cases so that we can compare it with our new results as often as we like without re-running that program.

  6. +
  7. Check conservation laws. Mass, energy, and other quantities are conserved in physical systems, so they should be in programs as well. Similarly, if we are analyzing patient data, the number of records should either stay the same or decrease as we move from one analysis to the next (since we might throw away outliers or records with missing values). If "new" patients start appearing out of nowhere as we move through our pipeline, it's probably a sign that something is wrong.

  8. +
  9. Visualize. Data analysts frequently use simple visualizations to check both the science they're doing and the correctness of their code (just as we did in the opening lesson of this tutorial). This should only be used for debugging as a last resort, though, since it's very hard to compare two visualizations automatically.

  10. +
+

Make It Fail Every Time

+

We can only debug something when it fails, so the second step is always to find a test case that makes it fail every time. The "every time" part is important because few things are more frustrating than debugging an intermittent problem: if we have to call a function a dozen times to get a single failure, the odds are good that we'll scroll past the failure when it actually occurs.

+

As part of this, it's always important to check that our code is "plugged in", i.e., that we're actually exercising the problem that we think we are. Every programmer has spent hours chasing a bug, only to realize that they were actually calling their code on the wrong data set or with the wrong configuration parameters, or are using the wrong version of the software entirely. Mistakes like these are particularly likely to happen when we're tired, frustrated, and up against a deadline, which is one of the reasons late-night (or overnight) coding sessions are almost never worthwhile.

+

Make It Fail Fast

+

If it takes 20 minutes for the bug to surface, we can only do three experiments an hour. That doesn't just mean we'll get less data in more time: we're also more likely to be distracted by other things as we wait for our program to fail, which means the time we are spending on the problem is less focused. It's therefore critical to make it fail fast.

+

As well as making the program fail fast in time, we want to make it fail fast in space, i.e., we want to localize the failure to the smallest possible region of code:

+
    +
  1. The smaller the gap between cause and effect, the easier the connection is to find. Many programmers therefore use a divide and conquer strategy to find bugs, i.e., if the output of a function is wrong, they check whether things are OK in the middle, then concentrate on either the first or second half, and so on.

  2. +
  3. N things can interact in N²/2 different ways, so every line of code that isn't run as part of a test means more than one thing we don't need to worry about.

  4. +
+

Change One Thing at a Time, For a Reason

+

Replacing random chunks of code is unlikely to do much good. (After all, if you got it wrong the first time, you'll probably get it wrong the second and third as well.) Good programmers therefore change one thing at a time, for a reason. They are either trying to gather more information ("is the bug still there if we change the order of the loops?") or test a fix ("can we make the bug go away by sorting our data before processing it?").

+

Every time we make a change, however small, we should re-run our tests immediately, because the more things we change at once, the harder it is to know what's responsible for what (those N² interactions again). And we should re-run all of our tests: more than half of fixes made to code introduce (or re-introduce) bugs, so re-running all of our tests tells us whether we have regressed.

+

Keep Track of What You've Done

+

Good scientists keep track of what they've done so that they can reproduce their work, and so that they don't waste time repeating the same experiments or running ones whose results won't be interesting. Similarly, debugging works best when we keep track of what we've done and how well it worked. If we find ourselves asking, "Did left followed by right with an odd number of lines cause the crash? Or was it right followed by left? Or was I using an even number of lines?" then it's time to step away from the computer, take a deep breath, and start working more systematically.

+

Records are particularly useful when the time comes to ask for help. People are more likely to listen to us when we can explain clearly what we did, and we're better able to give them the information they need to be useful.

+
+

Version Control Revisited

+

Version control is often used to reset software to a known state during debugging, and to explore recent changes to code that might be responsible for bugs. In particular, most version control systems have a blame command that will show who last changed particular lines of code...

+
+

Be Humble

+

And speaking of help: if we can't find a bug in 10 minutes, we should be humble and ask for help. Just explaining the problem aloud is often useful, since hearing what we're thinking helps us spot inconsistencies and hidden assumptions.

+

Asking for help also helps alleviate confirmation bias. If we have just spent an hour writing a complicated program, we want it to work, so we're likely to keep telling ourselves why it should, rather than searching for the reason it doesn't. People who aren't emotionally invested in the code can be more objective, which is why they're often able to spot the simple mistakes we have overlooked.

+

Part of being humble is learning from our mistakes. Programmers tend to get the same things wrong over and over: either they don't understand the language and libraries they're working with, or their model of how things work is wrong. In either case, taking note of why the error occurred and checking for it next time quickly turns into not making the mistake at all.

+

And that is what makes us most productive in the long run. As the saying goes, "A week of hard work can sometimes save you an hour of thought." If we train ourselves to avoid making some kinds of mistakes, to break our code into modular, testable chunks, and to turn every assumption (or mistake) into an assertion, it will actually take us less time to produce working programs, not more.

+
+ + +
+

Key Points

+
    +
  • Program defensively, i.e., assume that errors are going to arise, and write code to detect them when they do.
  • +
  • Put assertions in programs to check their state as they run, and to help readers understand how those programs are supposed to work.
  • +
  • Use preconditions to check that the inputs to a function are safe to use.
  • +
  • Use postconditions to check that the output from a function is safe to use.
  • +
  • Write tests before writing code in order to help determine exactly what that code is supposed to do.
  • +
  • Know what code is supposed to do before trying to debug it.
  • +
  • Make it fail every time.
  • +
  • Make it fail fast.
  • +
  • Change one thing at a time, and for a reason.
  • +
  • Keep track of what you've done.
  • +
  • Be humble.
  • +
+
+ + +
+

Next Steps

+

We have now seen the basics of building and testing Python code in the IPython Notebook. The last thing we need to learn is how to build command-line programs that we can use in pipelines and shell scripts, so that we can integrate our tools with other people's work. This will be the subject of our next and final lesson.

+
diff --git a/cached/novice/python/06-cmdline.md b/cached/novice/python/06-cmdline.md new file mode 100644 index 000000000..99ba33921 --- /dev/null +++ b/cached/novice/python/06-cmdline.md @@ -0,0 +1,660 @@ +--- +layout: lesson +root: ../.. +--- + +## Command-Line Programs + + +
+

The IPython Notebook and other interactive tools are great for prototyping code and exploring data, but sooner or later we will want to use our program in a pipeline or run it in a shell script to process thousands of data files. In order to do that, we need to make our programs work like other Unix command-line tools. For example, we may want a program that reads a data set and prints the average inflammation per patient:

+
$ python readings.py --mean inflammation-01.csv
+5.45
+5.425
+6.1
+...
+6.4
+7.05
+5.9
+

but we might also want to look at the minimum of the first four lines

+
$ head -4 inflammation-01.csv | python readings.py --min
+

or the maximum inflammations in several files one after another:

+
$ python readings.py --max inflammation-*.csv
+

Our overall requirements are:

+
    +
  1. If no filename is given on the command line, read data from standard input.
  2. +
  3. If one or more filenames are given, read data from them and report statistics for each file separately.
  4. +
  5. Use the --min, --mean, or --max flag to determine what statistic to print.
  6. +
+

To make this work, we need to know how to handle command-line arguments in a program, and how to get at standard input. We'll tackle these questions in turn below.

+
+ + +
+

Objectives

+
    +
  • Use the values of command-line arguments in a program.
  • +
  • Handle flags and files separately in a command-line program.
  • +
  • Read data from standard input in a program so that it can be used in a pipeline.
  • +
+
+ +### Command-Line Arguments + + +
+

Using the text editor of your choice, save the following in a text file:

+
+ + +
+
!cat sys-version.py
+
+ +
+
import sys
+print 'version is', sys.version
+
+
+ + +
+

The first line imports a library called sys, which is short for "system". It defines values such as sys.version, which describes which version of Python we are running. We can run this script from within the IPython Notebook like this:

+
+ + +
+
%run sys-version.py
+
+ +
+
version is 2.7.5 |Anaconda 1.8.0 (x86_64)| (default, Oct 24 2013, 07:02:20) 
+[GCC 4.0.1 (Apple Inc. build 5493)]
+
+
+ + +
+

or like this:

+
+ + +
+
!ipython sys-version.py
+
+ +
+
version is 2.7.5 |Anaconda 1.8.0 (x86_64)| (default, Oct 24 2013, 07:02:20) 
+[GCC 4.0.1 (Apple Inc. build 5493)]
+
+
+ + +
+

The first method, %run, uses a special command in the IPython Notebook to run a program in a .py file. The second method is more general: the exclamation mark ! tells the Notebook to run a shell command, and it just so happens that the command we run is ipython with the name of the script.

+
+ + +
+

Here's another script that does something more interesting:

+
+ + +
+
!cat argv-list.py
+
+ +
+
import sys
+print 'sys.argv is', sys.argv
+
+
+ + +
+

The strange name argv stands for "argument values". Whenever Python runs a program, it takes all of the values given on the command line and puts them in the list sys.argv so that the program can determine what they were. If we run this program with no arguments:

+
+ + +
+
!ipython argv-list.py
+
+ +
+
sys.argv is ['/Users/gwilson/s/bc/python/novice/argv-list.py']
+
+
+ + +
+

the only thing in the list is the full path to our script, which is always sys.argv[0]. If we run it with a few arguments, however:

+
+ + +
+
!ipython argv-list.py first second third
+
+ +
+
sys.argv is ['/Users/gwilson/s/bc/python/novice/argv-list.py', 'first', 'second', 'third']
+
+
+ + +
+

then Python adds each of those arguments to that magic list.

+
+ + +
+

With this in hand, let's build a version of readings.py that always prints the per-patient mean of a single data file. The first step is to write a function that outlines our implementation, and a placeholder for the function that does the actual work. By convention this function is usually called main, though we can call it whatever we want:

+
+ + +
+
!cat readings-01.py
+
+ +
+
import sys
+import numpy as np
+
+def main():
+    script = sys.argv[0]
+    filename = sys.argv[1]
+    data = np.loadtxt(filename, delimiter=',')
+    for m in data.mean(axis=1):
+        print m
+
+
+ + +
+

This function gets the name of the script from sys.argv[0], because that's where it's always put, and the name of the file to process from sys.argv[1]. Here's a simple test:

+
+ + +
+
%run readings-01.py inflammation-01.csv
+
+ + +
+

There is no output because we have defined a function, but haven't actually called it. Let's add a call to main:

+
+ + +
+
!cat readings-02.py
+
+ +
+
import sys
+import numpy as np
+
+def main():
+    script = sys.argv[0]
+    filename = sys.argv[1]
+    data = np.loadtxt(filename, delimiter=',')
+    for m in data.mean(axis=1):
+        print m
+
+main()
+
+
+ + +
+

and run that:

+
+ + +
+
%run readings-02.py inflammation-01.csv
+
+ +
+
5.45
+5.425
+6.1
+5.9
+5.55
+6.225
+5.975
+6.65
+6.625
+6.525
+6.775
+5.8
+6.225
+5.75
+5.225
+6.3
+6.55
+5.7
+5.85
+6.55
+5.775
+5.825
+6.175
+6.1
+5.8
+6.425
+6.05
+6.025
+6.175
+6.55
+6.175
+6.35
+6.725
+6.125
+7.075
+5.725
+5.925
+6.15
+6.075
+5.75
+5.975
+5.725
+6.3
+5.9
+6.75
+5.925
+7.225
+6.15
+5.95
+6.275
+5.7
+6.1
+6.825
+5.975
+6.725
+5.7
+6.25
+6.4
+7.05
+5.9
+
+
+ + +
+
+

The Right Way to Do It

+

If our programs can take complex parameters or multiple filenames, we shouldn't handle sys.argv directly. Instead, we should use Python's argparse library, which handles common cases in a systematic way, and also makes it easy for us to provide sensible error messages for our users.

+
+
+ + +
+

Challenges

+
    +
  1. Write a command-line program that does addition and subtraction: ~ python arith.py 1 + 2 3 python arith.py 3 - 4 -1 ~

    +

    What goes wrong if you try to add multiplication using '*' to the program?

  2. +
  3. Using the glob module introduced in 03-loop.ipynb, write a simple version of ls that shows files in the current directory with a particular suffix: ~ python my_ls.py py left.py right.py zero.py ~

  4. +
+
+ +### Handling Multiple Files + + +
+

The next step is to teach our program how to handle multiple files. Since 60 lines of output per file is a lot to page through, we'll start by creating three smaller files, each of which has three days of data for two patients:

+
+ + +
+
!ls small-*.csv
+
+ +
+
small-01.csv small-02.csv small-03.csv
+
+
+ + +
+
!cat small-01.csv
+
+ +
+
0,0,1
+0,1,2
+
+
+ + +
+
%run readings-02.py small-01.csv
+
+ +
+
0.333333333333
+1.0
+
+
+ + +
+

Using small data files as input also allows us to check our results more easily: here, for example, we can see that our program is calculating the mean correctly for each line, whereas we were really taking it on faith before. This is yet another rule of programming: "test the simple things first".

+

We want our program to process each file separately, so we need a loop that executes once for each filename. If we specify the files on the command line, the filenames will be in sys.argv, but we need to be careful: sys.argv[0] will always be the name of our script, rather than the name of a file. We also need to handle an unknown number of filenames, since our program could be run for any number of files.

+

The solution to both problems is to loop over the contents of sys.argv[1:]. The '1' tells Python to start the slice at location 1, so the program's name isn't included; since we've left off the upper bound, the slice runs to the end of the list, and includes all the filenames. Here's our changed program:

+
+ + +
+
!cat readings-03.py
+
+ +
+
import sys
+import numpy as np
+
+def main():
+    script = sys.argv[0]
+    for filename in sys.argv[1:]:
+        data = np.loadtxt(filename, delimiter=',')
+        for m in data.mean(axis=1):
+            print m
+
+main()
+
+
+ + +
+

and here it is in action:

+
+ + +
+
%run readings-03.py small-01.csv small-02.csv
+
+ +
+
0.333333333333
+1.0
+13.6666666667
+11.0
+
+
+ + +
+

Note: at this point, we have created three versions of our script called readings-01.py, readings-02.py, and readings-03.py. We wouldn't do this in real life: instead, we would have one file called readings.py that we committed to version control every time we got an enhancement working. For teaching, though, we need all the successive versions side by side.

+
+ + +
+

Challenges

+
    +
  1. Write a program called check.py that takes the names of one or more inflammation data files as arguments and checks that all the files have the same number of rows and columns. What is the best way to test your program?
  2. +
+
+ +### Handling Command-Line Flags + + +
+

The next step is to teach our program to pay attention to the --min, --mean, and --max flags. These always appear before the names of the files, so we could just do this:

+
+ + +
+
!cat readings-04.py
+
+ +
+
import sys
+import numpy as np
+
+def main():
+    script = sys.argv[0]
+    action = sys.argv[1]
+    filenames = sys.argv[2:]
+
+    for f in filenames:
+        data = np.loadtxt(f, delimiter=',')
+
+        if action == '--min':
+            values = data.min(axis=1)
+        elif action == '--mean':
+            values = data.mean(axis=1)
+        elif action == '--max':
+            values = data.max(axis=1)
+
+        for m in values:
+            print m
+
+main()
+
+
+ + +
+

This works:

+
+ + +
+
%run readings-04.py --max small-01.csv
+
+ +
+
1.0
+2.0
+
+
+ + +
+

but there are several things wrong with it:

+
    +
  1. main is too large to read comfortably.

  2. +
  3. If action isn't one of the three recognized flags, the program loads each file but does nothing with it (because none of the branches in the conditional match). Silent failures like this are always hard to debug.

  4. +
+

This version pulls the processing of each file out of the loop into a function of its own. It also checks that action is one of the allowed flags before doing any processing, so that the program fails fast:

+
+ + +
+
!cat readings-05.py
+
+ +
+
import sys
+import numpy as np
+
+def main():
+    script = sys.argv[0]
+    action = sys.argv[1]
+    filenames = sys.argv[2:]
+    assert action in ['--min', '--mean', '--max'], \
+           'Action is not one of --min, --mean, or --max: ' + action
+    for f in filenames:
+        process(f, action)
+
+def process(filename, action):
+    data = np.loadtxt(filename, delimiter=',')
+
+    if action == '--min':
+        values = data.min(axis=1)
+    elif action == '--mean':
+        values = data.mean(axis=1)
+    elif action == '--max':
+        values = data.max(axis=1)
+
+    for m in values:
+        print m
+
+main()
+
+
+ + +
+

This is four lines longer than its predecessor, but broken into more digestible chunks of 8 and 12 lines.

+
+ + +
+

Python has a module named argparse that helps handle complex command-line flags. We will not cover this module in this lesson but you can go to Tshepang Lekhonkhobe's Argparse tutorial that is part of Python's Official Documentation.

+
+ + +
+

Challenges

+
    +
  1. Rewrite this program so that it uses -n, -m, and -x instead of --min, --mean, and --max respectively. Is the code easier to read? Is the program easier to understand?

  2. +
  3. Separately, modify the program so that if no parameters are given (i.e., no action is specified and no filenames are given), it prints a message explaining how it should be used.

  4. +
  5. Separately, modify the program so that if no action is given it displays the means of the data.

  6. +
+
+ +### Handling Standard Input + + +
+

The next thing our program has to do is read data from standard input if no filenames are given so that we can put it in a pipeline, redirect input to it, and so on. Let's experiment in another script:

+
+ + +
+
!cat count-stdin.py
+
+ +
+
import sys
+
+count = 0
+for line in sys.stdin:
+    count += 1
+
+print count, 'lines in standard input'
+
+
+ + +
+

This little program reads lines from a special "file" called sys.stdin, which is automatically connected to the program's standard input. We don't have to open it—Python and the operating system take care of that when the program starts up— but we can do almost anything with it that we could do to a regular file. Let's try running it as if it were a regular command-line program:

+
+ + +
+
!ipython count-stdin.py < small-01.csv
+
+ +
+
2 lines in standard input
+
+
+ + +
+

What if we run it using %run?

+
+ + +
+
%run count-stdin.py < fractal_1.txt
+
+ +
+
0 lines in standard input
+
+
+ + +
+

As you can see, %run doesn't understand file redirection: that's a shell thing.

+

A common mistake is to try to run something that reads from standard input like this:

+
!ipython count_stdin.py fractal_1.txt
+

i.e., to forget the < character that redirects the file to standard input. In this case, there's nothing in standard input, so the program waits at the start of the loop for someone to type something on the keyboard. Since there's no way for us to do this, our program is stuck, and we have to halt it using the Interrupt option from the Kernel menu in the Notebook.

+

We now need to rewrite the program so that it loads data from sys.stdin if no filenames are provided. Luckily, numpy.loadtxt can handle either a filename or an open file as its first parameter, so we don't actually need to change process. That leaves main:

+
+ + +
+
def main():
+    script = sys.argv[0]
+    action = sys.argv[1]
+    filenames = sys.argv[2:]
+    assert action in ['--min', '--mean', '--max'], \
+           'Action is not one of --min, --mean, or --max: ' + action
+    if len(filenames) == 0:
+        process(sys.stdin, action)
+    else:
+        for f in filenames:
+            process(f, action)
+
+ + +
+

Let's try it out (we'll see in a moment why we send the output through head):

+
+ + +
+
!ipython readings-06.py --mean < small-01.csv | head -10
+
+ +
+
[TerminalIPythonApp] CRITICAL | Bad config encountered during initialization:
+[TerminalIPythonApp] CRITICAL | Unrecognized flag: '--mean'
+=========
+ IPython
+=========
+
+Tools for Interactive Computing in Python
+=========================================
+
+    A Python shell with automatic history (input and output), dynamic object
+    introspection, easier configuration, command completion, access to the
+    system shell and more.  IPython can also be embedded in running programs.
+
+
+ + +
+

Whoops: why are we getting IPython's help rather than the line-by-line average of our data? The answer is that IPython has a hard time telling which command-line arguments are meant for it, and which are meant for the program it's running. To make our meaning clear, we have to use -- (a double dash) to separate the two:

+
+ + +
+
!ipython readings-06.py -- --mean < small-01.csv
+
+ +
+
0.333333333333
+1.0
+
+
+ + +
+

That's better. In fact, that's done: the program now does everything we set out to do.

+
+ + +
+

Challenges

+
    +
  1. Write a program called line-count.py that works like the Unix wc command: +
      +
    • If no filenames are given, it reports the number of lines in standard input.
    • +
    • If one or more filenames are given, it reports the number of lines in each, followed by the total number of lines.
    • +
  2. +
+
+ + +
+

Key Points

+
    +
  • The sys library connects a Python program to the system it is running on.
  • +
  • The list sys.argv contains the command-line arguments that a program was run with.
  • +
  • Avoid silent failures.
  • +
  • The "file" sys.stdin connects to a program's standard input.
  • +
  • The "file" sys.stdout connects to a program's standard output.
  • +
+
diff --git a/cached/novice/sql/01-select.md b/cached/novice/sql/01-select.md new file mode 100644 index 000000000..c14458166 --- /dev/null +++ b/cached/novice/sql/01-select.md @@ -0,0 +1,532 @@ +--- +layout: lesson +root: ../.. +--- + +## Selecting Data + + +
+

In the late 1920s and early 1930s, William Dyer, Frank Pabodie, and Valentina Roerich led expeditions to the Pole of Inaccessibility in the South Pacific, and then onward to Antarctica. Two years ago, their expeditions were found in a storage locker at Miskatonic University. We have scanned and OCR'd the data they contain, and we now want to store that information in a way that will make search and analysis easy.

+

We basically have three options: text files, a spreadsheet, or a database. Text files are easiest to create, and work well with version control, but then we would have to build search and analysis tools ourselves. Spreadsheets are good for doing simple analysis, but they don't handle large or complex data sets very well. We would therefore like to put this data in a database, and these lessons will show how to do that.

+
+ + +
+

Objectives

+
    +
  • Explain the difference between a table, a record, and a field.
  • +
  • Explain the difference between a database and a database manager.
  • +
  • Write a query to select all values for specific fields from a single table.
  • +
+
+ +### A Few Definitions + + +
+

A relational database is a way to store and manipulate information that is arranged as tables. Each table has columns (also known as fields) which describe the data, and rows (also known as records) which contain the data.

+

When we are using a spreadsheet, we put formulas into cells to calculate new values based on old ones. When we are using a database, we send commands (usually called queries) to a database manager: a program that manipulates the database for us. The database manager does whatever lookups and calculations the query specifies, returning the results in a tabular form that we can then use as a starting point for further queries.

+
+

Every database manager—Oracle, IBM DB2, PostgreSQL, MySQL, Microsoft Access, and SQLite—stores data in a different way, so a database created with one cannot be used directly by another. However, every database manager can import and export data in a variety of formats, so it is possible to move information from one to another.

+
+

Queries are written in a language called SQL, which stands for "Structured Query Language". SQL provides hundreds of different ways to analyze and recombine data; we will only look at a handful, but that handful accounts for most of what scientists do.

+

The tables below show the database we will use in our examples:

+
+ + +
+ + + + + +
+

Person: people who took readings.

+ + + + + + + +
+ident + +personal + +family +
+dyer + +William + +Dyer +
+pb + +Frank + +Pabodie +
+lake + +Anderson + +Lake +
+roe + +Valentina + +Roerich +
+danforth + +Frank + +Danforth +
+ +

Site: locations where readings were taken.

+ + + + + +
+name + +lat + +long +
+DR-1 + +-49.85 + +-128.57 +
+DR-3 + +-47.15 + +-126.72 +
+MSK-4 + +-48.87 + +-123.4 +
+ +

Visited: when readings were taken at specific sites.

+ + + + + + + + + + +
+ident + +site + +dated +
+619 + +DR-1 + +1927-02-08 +
+622 + +DR-1 + +1927-02-10 +
+734 + +DR-3 + +1939-01-07 +
+735 + +DR-3 + +1930-01-12 +
+751 + +DR-3 + +1930-02-26 +
+752 + +DR-3 + +  +
+837 + +MSK-4 + +1932-01-14 +
+844 + +DR-1 + +1932-03-22 +
+
+

Survey: the actual readings.

+ + + + + + + + + + + + + + + + + + + + + + + +
+taken + +person + +quant + +reading +
+619 + +dyer + +rad + +9.82 +
+619 + +dyer + +sal + +0.13 +
+622 + +dyer + +rad + +7.8 +
+622 + +dyer + +sal + +0.09 +
+734 + +pb + +rad + +8.41 +
+734 + +lake + +sal + +0.05 +
+734 + +pb + +temp + +-21.5 +
+735 + +pb + +rad + +7.22 +
+735 + +  + +sal + +0.06 +
+735 + +  + +temp + +-26.0 +
+751 + +pb + +rad + +4.35 +
+751 + +pb + +temp + +-18.5 +
+751 + +lake + +sal + +0.1 +
+752 + +lake + +rad + +2.19 +
+752 + +lake + +sal + +0.09 +
+752 + +lake + +temp + +-16.0 +
+752 + +roe + +sal + +41.6 +
+837 + +lake + +rad + +1.46 +
+837 + +lake + +sal + +0.21 +
+837 + +roe + +sal + +22.5 +
+844 + +roe + +rad + +11.25 +
+
+ +
+ + +
+

Notice that three entries—one in the Visited table, and two in the Survey table—are shown in red because they don't contain any actual data: we'll return to these missing values later. For now, let's write an SQL query that displays scientists' names. We do this using the SQL command select, giving it the names of the columns we want and the table we want them from. Our query and its output look like this:

+
+ + +
+
%load_ext sqlitemagic
+
+ + +
+
%%sqlite survey.db
+select family, personal from Person;
+
+ +
+

+
+
+
+
+
+
DyerWilliam
PabodieFrank
LakeAnderson
RoerichValentina
DanforthFrank
+
+ + +
+

The semi-colon at the end of the query tells the database manager that the query is complete and ready to run. We have written our commands and column names in lower case, and the table name in Title Case, but we don't have to: as the example below shows, SQL is case insensitive.

+
+ + +
+
%%sqlite survey.db
+SeLeCt FaMiLy, PeRsOnAl FrOm PeRsOn;
+
+ +
+

+
+
+
+
+
+
DyerWilliam
PabodieFrank
LakeAnderson
RoerichValentina
DanforthFrank
+
+ + +
+

Whatever casing convention you choose, please be consistent: complex queries are hard enough to read without the extra cognitive load of random capitalization.

+
+ + +
+

Going back to our query, it's important to understand that the rows and columns in a database table aren't actually stored in any particular order. They will always be displayed in some order, but we can control that in various ways. For example, we could swap the columns in the output by writing our query as:

+
+ + +
+
%%sqlite survey.db
+select personal, family from Person;
+
+ +
+

+
+
+
+
+
+
WilliamDyer
FrankPabodie
AndersonLake
ValentinaRoerich
FrankDanforth
+
+ + +
+

or even repeat columns:

+
+ + +
+
%%sqlite survey.db
+select ident, ident, ident from Person;
+
+ +
+

+
+
+
+
+
+
dyerdyerdyer
pbpbpb
lakelakelake
roeroeroe
danforthdanforthdanforth
+
+ + +
+

As a shortcut, we can select all of the columns in a table using *:

+
+ + +
+
%%sqlite survey.db
+select * from Person;
+
+ +
+

+
+
+
+
+
+
dyerWilliamDyer
pbFrankPabodie
lakeAndersonLake
roeValentinaRoerich
danforthFrankDanforth
+
+ + +
+

Challenges

+
    +
  1. Write a query that selects only site names from the Site table.

  2. +
  3. Many people format queries as:

    +
    SELECT personal, family FROM person;
    +

    or as:

    +
    select Personal, Family from PERSON;
    +

    What style do you find easiest to read, and why?

  4. +
+
+ + +
+

Key Points

+
    +
  • A relational database stores information in tables, each of which has a fixed set of columns and a variable number of records.
  • +
  • A database manager is a program that manipulates information stored in a database.
  • +
  • We write queries in a specialized language called SQL to extract information from databases.
  • +
  • SQL is case-insensitive.
  • +
+
diff --git a/cached/novice/sql/02-sort-dup.md b/cached/novice/sql/02-sort-dup.md new file mode 100644 index 000000000..d427cc2ae --- /dev/null +++ b/cached/novice/sql/02-sort-dup.md @@ -0,0 +1,251 @@ +--- +layout: lesson +root: ../.. +--- + +## Sorting and Removing Duplicates + + +
+

Objectives

+
    +
  • Write queries that display results in a particular order.
  • +
  • Write queries that eliminate duplicate values from data.
  • +
+
+ + +
+

Data is often redundant, so queries often return redundant information. For example, if we select the quantities that have been measured from the survey table, we get this:

+
+ + +
+
%load_ext sqlitemagic
+
+ + +
+
%%sqlite survey.db
+select quant from Survey;
+
+ +
+

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
rad
sal
rad
sal
rad
sal
temp
rad
sal
temp
rad
temp
sal
rad
sal
temp
sal
rad
sal
sal
rad
+
+ + +
+

We can eliminate the redundant output to make the result more readable by adding the distinct keyword to our query:

+
+ + +
+
%%sqlite survey.db
+select distinct quant from Survey;
+
+ +
+

+
+
+
+
rad
sal
temp
+
+ + +
+

If we select more than one column—for example, both the survey site ID and the quantity measured—then the distinct pairs of values are returned:

+
+ + +
+
%%sqlite survey.db
+select distinct taken, quant from Survey;
+
+ +
+

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
619rad
619sal
622rad
622sal
734rad
734sal
734temp
735rad
735sal
735temp
751rad
751temp
751sal
752rad
752sal
752temp
837rad
837sal
844rad
+
+ + +
+

Notice in both cases that duplicates are removed even if they didn't appear to be adjacent in the database. Again, it's important to remember that rows aren't actually ordered: they're just displayed that way.

+
+ + +
+

Challenges

+
    +
  1. Write a query that selects distinct dates from the Site table.
  2. +
+
+ + +
+

As we mentioned earlier, database records are not stored in any particular order. This means that query results aren't necessarily sorted, and even if they are, we often want to sort them in a different way, e.g., by the name of the project instead of by the name of the scientist. We can do this in SQL by adding an order by clause to our query:

+
+ + +
+
%%sqlite survey.db
+select * from Person order by ident;
+
+ +
+

+
+
+
+
+
+
danforthFrankDanforth
dyerWilliamDyer
lakeAndersonLake
pbFrankPabodie
roeValentinaRoerich
+
+ + +
+

By default, results are sorted in ascending order (i.e., from least to greatest). We can sort in the opposite order using desc (for "descending"):

+
+ + +
+
%%sqlite survey.db
+select * from person order by ident desc;
+
+ +
+

+
+
+
+
+
+
roeValentinaRoerich
pbFrankPabodie
lakeAndersonLake
dyerWilliamDyer
danforthFrankDanforth
+
+ + +
+

(And if we want to make it clear that we're sorting in ascending order, we can use asc instead of desc.)

+

We can also sort on several fields at once. For example, this query sorts results first in ascending order by taken, and then in descending order by person within each group of equal taken values:

+
+ + +
+
%%sqlite survey.db
+select taken, person from Survey order by taken asc, person desc;
+
+ +
+

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
619dyer
619dyer
622dyer
622dyer
734pb
734pb
734lake
735pb
735None
735None
751pb
751pb
751lake
752roe
752lake
752lake
752lake
837roe
837lake
837lake
844roe
+
+ + +
+

This is easier to understand if we also remove duplicates:

+
+ + +
+
%%sqlite survey.db
+select distinct taken, person from Survey order by taken asc, person desc;
+
+ +
+

+
+
+
+
+
+
+
+
+
+
+
+
+
+
619dyer
622dyer
734pb
734lake
735pb
735None
751pb
751lake
752roe
752lake
837roe
837lake
844roe
+
+ + +
+

Challenges

+
    +
  1. Write a query that returns the distinct dates in the Visited table.

  2. +
  3. Write a query that displays the full names of the scientists in the Person table, ordered by family name.

  4. +
+
+ + +
+

Key Points

+
    +
  • The records in a database table are not intrinsically ordered: if we want to display them in some order, we must specify that explicitly.
  • +
  • The values in a database are not guaranteed to be unique: if we want to eliminate duplicates, we must specify that explicitly as well.
  • +
+
diff --git a/cached/novice/sql/03-filter.md b/cached/novice/sql/03-filter.md new file mode 100644 index 000000000..6507be1ae --- /dev/null +++ b/cached/novice/sql/03-filter.md @@ -0,0 +1,274 @@ +--- +layout: lesson +root: ../.. +--- + +## Filtering + + +
+

Objectives

+
    +
  • Write queries that select records that satisfy user-specified conditions.
  • +
  • Explain the order in which the clauses in a query are executed.
  • +
+
+ + +
+

One of the most powerful features of a database is the ability to filter data, i.e., to select only those records that match certain criteria. For example, suppose we want to see when a particular site was visited. We can select these records from the Visited table by using a where clause in our query:

+
+ + +
+
%load_ext sqlitemagic
+
+ + +
+
%%sqlite survey.db
+select * from Visited where site='DR-1';
+
+ +
+

+
+
+
+
619DR-11927-02-08
622DR-11927-02-10
844DR-11932-03-22
+
+ + +
+

The database manager executes this query in two stages. First, it checks each row in the Visited table to see which ones satisfy the where. It then uses the column names following the select keyword to determine what columns to display.

+
+ + +
+

This processing order means that we can filter records using where based on values in columns that aren't then displayed:

+
+ + +
+
%%sqlite survey.db
+select ident from Visited where site='DR-1';
+
+ +
+

+
+
+
+
619
622
844
+
+ + +
+

SQL Filtering in Action

+
+ + +
+

We can use many other Boolean operators to filter our data. For example, we can ask for all information from the DR-1 site collected since 1930:

+
+ + +
+
%%sqlite survey.db
+select * from Visited where (site='DR-1') and (dated>='1930-00-00');
+
+ +
+

+
+
844DR-11932-03-22
+
+ + +
+

(The parentheses around the individual tests aren't strictly required, but they help make the query easier to read.)

+
+

Most database managers have a special data type for dates. In fact, many have two: one for dates, such as "May 31, 1971", and one for durations, such as "31 days". SQLite doesn't: instead, it stores dates as either text (in the ISO-8601 standard format "YYYY-MM-DD HH:MM:SS.SSSS"), real numbers (the number of days since November 24, 4714 BCE), or integers (the number of seconds since midnight, January 1, 1970). If this sounds complicated, it is, but not nearly as complicated as figuring out historical dates in Sweden.

+
+
+ + +
+

If we want to find out what measurements were taken by either Lake or Roerich, we can combine the tests on their names using or:

+
+ + +
+
%%sqlite survey.db
+select * from Survey where person='lake' or person='roe';
+
+ +
+

+
+
+
+
+
+
+
+
+
+
+
734lakesal0.05
751lakesal0.1
752lakerad2.19
752lakesal0.09
752laketemp-16.0
752roesal41.6
837lakerad1.46
837lakesal0.21
837roesal22.5
844roerad11.25
+
+ + +
+

Alternatively, we can use in to see if a value is in a specific set:

+
+ + +
+
%%sqlite survey.db
+select * from Survey where person in ('lake', 'roe');
+
+ +
+

+
+
+
+
+
+
+
+
+
+
+
734lakesal0.05
751lakesal0.1
752lakerad2.19
752lakesal0.09
752laketemp-16.0
752roesal41.6
837lakerad1.46
837lakesal0.21
837roesal22.5
844roerad11.25
+
+ + +
+

We can combine and with or, but we need to be careful about which operator is executed first. If we don't use parentheses, we get this:

+
+ + +
+
%%sqlite survey.db
+select * from Survey where quant='sal' and person='lake' or person='roe';
+
+ +
+

+
+
+
+
+
+
+
+
734lakesal0.05
751lakesal0.1
752lakesal0.09
752roesal41.6
837lakesal0.21
837roesal22.5
844roerad11.25
+
+ + +
+

which is salinity measurements by Lake, and any measurement by Roerich. We probably want this instead:

+
+ + +
+
%%sqlite survey.db
+select * from Survey where quant='sal' and (person='lake' or person='roe');
+
+ +
+

+
+
+
+
+
+
+
734lakesal0.05
751lakesal0.1
752lakesal0.09
752roesal41.6
837lakesal0.21
837roesal22.5
+
+ + +
+

Finally, we can use distinct with where to give a second level of filtering:

+
+ + +
+
%%sqlite survey.db
+select distinct person, quant from Survey where person='lake' or person='roe';
+
+ +
+

+
+
+
+
+
+
lakesal
lakerad
laketemp
roesal
roerad
+
+ + +
+

But remember: distinct is applied to the values displayed in the chosen columns, not to the entire rows as they are being processed.

+
+

What we have just done is how most people "grow" their SQL queries. We started with something simple that did part of what we wanted, then added more clauses one by one, testing their effects as we went. This is a good strategy—in fact, for complex queries it's often the only strategy—but it depends on quick turnaround, and on us recognizing the right answer when we get it.

+

The best way to achieve quick turnaround is often to put a subset of data in a temporary database and run our queries against that, or to fill a small database with synthesized records. For example, instead of trying our queries against an actual database of 20 million Australians, we could run it against a sample of ten thousand, or write a small program to generate ten thousand random (but plausible) records and use that.

+
+
+ + +
+

Challenges

+
    +
  1. Suppose we want to select all sites that lie more than 30° from the poles. Our first query is:

    +
    select * from Site where (lat > -60) or (lat < 60);
    +

    Explain why this is wrong, and rewrite the query so that it is correct.

  2. +
  3. Normalized salinity readings are supposed to be between 0.0 and 1.0. Write a query that selects all records from Survey with salinity values outside this range.

  4. +
  5. The SQL test *column-name* like *pattern* is true if the value in the named column matches the pattern given; the character '%' can be used any number of times in the pattern to mean "match zero or more characters".

    + + + + + + + +
    +Expression + +Value +
    +'a' like 'a' + +True +
    +'a' like '%a' + +True +
    +'b' like '%a' + +False +
    +'alpha' like 'a%' + +True +
    +'alpha' like 'a%p%' + +True +
    +

    The expression *column-name* not like *pattern* inverts the test. Using like, write a query that finds all the records in Visited that aren't from sites labelled 'DR-something'.

  6. +
+
+ + +
+

Key Points

+
    +
  • Use where to filter records according to Boolean conditions.
  • +
  • Filtering is done on whole records, so conditions can use fields that are not actually displayed.
  • +
+
diff --git a/cached/novice/sql/04-calc.md b/cached/novice/sql/04-calc.md new file mode 100644 index 000000000..27ec229d9 --- /dev/null +++ b/cached/novice/sql/04-calc.md @@ -0,0 +1,272 @@ +--- +layout: lesson +root: ../.. +--- + +## Calculating New Values + + +
+

Objectives

+
    +
  • Write queries that calculate new values for each selected record.
  • +
+
+ + +
+

After carefully re-reading the expedition logs, we realize that the radiation measurements they report may need to be corrected upward by 5%. Rather than modifying the stored data, we can do this calculation on the fly as part of our query:

+
+ + +
+
%load_ext sqlitemagic
+
+ + +
+
%%sqlite survey.db
+select 1.05 * reading from Survey where quant='rad';
+
+ +
+

+
+
+
+
+
+
+
+
+
10.311
8.19
8.8305
7.581
4.5675
2.2995
1.533
11.8125
+
+ + +
+

When we run the query, the expression 1.05 * reading is evaluated for each row. Expressions can use any of the fields, all of the usual arithmetic operators, and a variety of common functions. (Exactly which ones depends on which database manager is being used.) For example, we can convert temperature readings from Fahrenheit to Celsius and round to two decimal places:

+
+ + +
+
%%sqlite survey.db
+select taken, round(5*(reading-32)/9, 2) from Survey where quant='temp';
+
+ +
+

+
+
+
+
+
734-29.72
735-32.22
751-28.06
752-26.67
+
+ + +
+

We can also combine values from different fields, for example by using the string concatenation operator ||:

+
+ + +
+
%%sqlite survey.db
+select personal || ' ' || family from Person;
+
+ +
+

+
+
+
+
+
+
William Dyer
Frank Pabodie
Anderson Lake
Valentina Roerich
Frank Danforth
+
+ + +
+
+

It may seem strange to use personal and family as field names instead of first and last, but it's a necessary first step toward handling cultural differences. For example, consider the following rules:

+
+ + + + + + + + + + + +
+Full Name + +Alphabetized Under + +Reason +
+Liu Xiaobo + +Liu + +Chinese family names come first +
+Leonardo da Vinci + +Leonardo + +"da Vinci" just means "from Vinci" +
+Catherine de Medici + +Medici + +family name +
+Jean de La Fontaine + +La Fontaine + +family name is "La Fontaine" +
+Juan Ponce de Leon + +Ponce de Leon + +full family name is "Ponce de Leon" +
+Gabriel Garcia Marquez + +Garcia Marquez + +double-barrelled Spanish surnames +
+Wernher von Braun + +von or Braun + +depending on whether he was in Germany or the US +
+Elizabeth Alexandra May Windsor + +Elizabeth + +monarchs alphabetize by the name under which they reigned +
+Thomas a Beckett + +Thomas + +and saints according to the names by which they were canonized +
+ +
+

Clearly, even a two-part division into "personal" and "family" isn't enough...

+
+
+ + +
+

Challenges

+
    +
  1. After further reading, we realize that Valentina Roerich was reporting salinity as percentages. Write a query that returns all of her salinity measurements from the Survey table with the values divided by 100.

  2. +
  3. The union operator combines the results of two queries:

  4. +
+
+ + +
+
%%sqlite survey.db
+select * from Person where ident='dyer' union select * from Person where ident='roe';
+
+ +
+

+
+
+
dyerWilliamDyer
roeValentinaRoerich
+
+ + +
+

Use union to create a consolidated list of salinity measurements in which Roerich's, and only Roerich's, have been corrected as described in the previous challenge. The output should be something like:

+ + + + + + + + + +
+619 + +0.13 +
+622 + +0.09 +
+734 + +0.05 +
+751 + +0.1 +
+752 + +0.09 +
+752 + +0.416 +
+837 + +0.21 +
+837 + +0.225 +
+ + +
+ + +
+
    +
  1. The site identifiers in the Visited table have two parts separated by a '-':
  2. +
+
+ + +
+
%%sqlite survey.db
+select distinct site from Visited;
+
+ +
+

+
+
+
+
DR-1
DR-3
MSK-4
+
+ + +
+

Some major site identifiers are two letters long and some are three. The "in string" function instr(X, Y) returns the 1-based index of the first occurrence of string Y in string X, or 0 if Y does not exist in X. The substring function substr(X, I) returns the substring of X starting at index I. Use these two functions to produce a list of unique major site identifiers. (For this data, the list should contain only "DR" and "MSK").

+
+ + +
+

Key Points

+
    +
  • SQL can perform calculations using the values in a record as part of a query.
  • +
+
diff --git a/cached/novice/sql/05-null.md b/cached/novice/sql/05-null.md new file mode 100644 index 000000000..c537621db --- /dev/null +++ b/cached/novice/sql/05-null.md @@ -0,0 +1,223 @@ +--- +layout: lesson +root: ../.. +--- + +## Missing Data + + +
+

Objectives

+
    +
  • Explain how databases represent missing information.
  • +
  • Explain the three-valued logic databases use when manipulating missing information.
  • +
  • Write queries that handle missing information correctly.
  • +
+
+ + +
+

Real-world data is never complete—there are always holes. Databases represent these holes using a special value called null. null is not zero, False, or the empty string; it is a one-of-a-kind value that means "nothing here". Dealing with null requires a few special tricks and some careful thinking.

+

To start, let's have a look at the Visited table. There are eight records, but #752 doesn't have a date—or rather, its date is null:

+
+ + +
+
%load_ext sqlitemagic
+
+ + +
+
%%sqlite survey.db
+select * from Visited;
+
+ +
+

+
+
+
+
+
+
+
+
+
619DR-11927-02-08
622DR-11927-02-10
734DR-31939-01-07
735DR-31930-01-12
751DR-31930-02-26
752DR-3None
837MSK-41932-01-14
844DR-11932-03-22
+
+ + +
+

Null doesn't behave like other values. If we select the records that come before 1930:

+
+ + +
+
%%sqlite survey.db
+select * from Visited where dated<'1930-00-00';
+
+ +
+

+
+
+
619DR-11927-02-08
622DR-11927-02-10
+
+ + +
+

we get two results, and if we select the ones that come during or after 1930:

+
+ + +
+
%%sqlite survey.db
+select * from Visited where dated>='1930-00-00';
+
+ +
+

+
+
+
+
+
+
734DR-31939-01-07
735DR-31930-01-12
751DR-31930-02-26
837MSK-41932-01-14
844DR-11932-03-22
+
+ + +
+

we get five, but record #752 isn't in either set of results. The reason is that null<'1930-00-00' is neither true nor false: null means, "We don't know," and if we don't know the value on the left side of a comparison, we don't know whether the comparison is true or false. Since databases represent "don't know" as null, the value of null<'1930-00-00' is actually null. null>='1930-00-00' is also null because we can't answer that question either. And since the only records kept by a where are those for which the test is true, record #752 isn't included in either set of results.

+

Comparisons aren't the only operations that behave this way with nulls. 1+null is null, 5*null is null, log(null) is null, and so on. In particular, comparing things to null with = and != produces null:

+
+ + +
+
%%sqlite survey.db
+select * from Visited where dated=NULL;
+
+ +
+

+
+
+
+ + +
+
%%sqlite survey.db
+select * from Visited where dated!=NULL;
+
+ +
+

+
+
+
+ + +
+

To check whether a value is null or not, we must use a special test is null:

+
+ + +
+
%%sqlite survey.db
+select * from Visited where dated is NULL;
+
+ +
+

+
+
752DR-3None
+
+ + +
+

or its inverse is not null:

+
+ + +
+
%%sqlite survey.db
+select * from Visited where dated is not NULL;
+
+ +
+

+
+
+
+
+
+
+
+
619DR-11927-02-08
622DR-11927-02-10
734DR-31939-01-07
735DR-31930-01-12
751DR-31930-02-26
837MSK-41932-01-14
844DR-11932-03-22
+
+ + +
+

Null values cause headaches wherever they appear. For example, suppose we want to find all the salinity measurements that weren't taken by Lake. It's natural to write the query like this:

+
+ + +
+
%%sqlite survey.db
+select * from Survey where quant='sal' and person!='lake';
+
+ +
+

+
+
+
+
+
619dyersal0.13
622dyersal0.09
752roesal41.6
837roesal22.5
+
+ + +
+

but this query omits the records where we don't know who took the measurement. Once again, the reason is that when person is null, the != comparison produces null, so the record isn't kept in our results. If we want to keep these records we need to add an explicit check:

+
+ + +
+
%%sqlite survey.db
+select * from Survey where quant='sal' and (person!='lake' or person is null);
+
+ +
+

+
+
+
+
+
+
619dyersal0.13
622dyersal0.09
735Nonesal0.06
752roesal41.6
837roesal22.5
+
+ + +
+

We still have to decide whether this is the right thing to do or not. If we want to be absolutely sure that we aren't including any measurements by Lake in our results, we need to exclude all the records for which we don't know who did the work.

+
+ + +
+

Challenges

+
    +
  1. Write a query that sorts the records in Visited by date, omitting entries for which the date is not known (i.e., is null).

  2. +
  3. What do you expect the query:

    +
    select * from Visited where dated in ('1927-02-08', null);
    +

    to produce? What does it actually produce?

  4. +
  5. Some database designers prefer to use a sentinel value to mark missing data rather than null. For example, they will use the date "0000-00-00" to mark a missing date, or -1.0 to mark a missing salinity or radiation reading (since actual readings cannot be negative). What does this simplify? What burdens or risks does it introduce?

  6. +
+
+ + +
+

Key Points

+
    +
  • Databases use null to represent missing information.
  • +
  • Any arithmetic or Boolean operation involving null produces null as a result.
  • +
  • The only operators that can safely be used with null are is null and is not null.
  • +
+
diff --git a/cached/novice/sql/06-agg.md b/cached/novice/sql/06-agg.md new file mode 100644 index 000000000..daa7a344d --- /dev/null +++ b/cached/novice/sql/06-agg.md @@ -0,0 +1,371 @@ +--- +layout: lesson +root: ../.. +--- + +## Aggregation + + +
+

Objectives

+
    +
  • Define "aggregation" and give examples of its use.
  • +
  • Write queries that compute aggregated values.
  • +
  • Trace the execution of a query that performs aggregation.
  • +
  • Explain how missing data is handled during aggregation.
  • +
+
+ + +
+

We now want to calculate ranges and averages for our data. We know how to select all of the dates from the Visited table:

+
+ + +
+
%load_ext sqlitemagic
+
+ + +
+
%%sqlite survey.db
+select dated from Visited;
+
+ +
+

+
+
+
+
+
+
+
+
+
1927-02-08
1927-02-10
1939-01-07
1930-01-12
1930-02-26
None
1932-01-14
1932-03-22
+
+ + +
+

but to combine them, we must use an aggregation function such as min or max. Each of these functions takes a set of records as input, and produces a single record as output:

+
+ + +
+
%%sqlite survey.db
+select min(dated) from Visited;
+
+ +
+

+
+
1927-02-08
+
+ + +
+

SQL Aggregation

+
+ + +
+
%%sqlite survey.db
+select max(dated) from Visited;
+
+ +
+

+
+
1939-01-07
+
+ + +
+

min and max are just two of the aggregation functions built into SQL. Three others are avg, count, and sum:

+
+ + +
+
%%sqlite survey.db
+select avg(reading) from Survey where quant='sal';
+
+ +
+

+
+
7.20333333333
+
+ + +
+
%%sqlite survey.db
+select count(reading) from Survey where quant='sal';
+
+ +
+

+
+
9
+
+ + +
+
%%sqlite survey.db
+select sum(reading) from Survey where quant='sal';
+
+ +
+

+
+
64.83
+
+ + +
+

We used count(reading) here, but we could just as easily have counted quant or any other field in the table, or even used count(*), since the function doesn't care about the values themselves, just how many values there are.

+

SQL lets us do several aggregations at once. We can, for example, find the range of sensible salinity measurements:

+
+ + +
+
%%sqlite survey.db
+select min(reading), max(reading) from Survey where quant='sal' and reading<=1.0;
+
+ +
+

+
+
0.050.21
+
+ + +
+

We can also combine aggregated results with raw results, although the output might surprise you:

+
+ + +
+
%%sqlite survey.db
+select person, count(*) from Survey where quant='sal' and reading<=1.0;
+
+ +
+

+
+
lake7
+
+ + +
+

Why does Lake's name appear rather than Roerich's or Dyer's? The answer is that when it has to aggregate a field, but isn't told how to, the database manager chooses an actual value from the input set. It might use the first one processed, the last one, or something else entirely.

+

Another important fact is that when there are no values to aggregate, aggregation's result is "don't know" rather than zero or some other arbitrary value:

+
+ + +
+
%%sqlite survey.db
+select person, max(reading), sum(reading) from Survey where quant='missing';
+
+ +
+

+
+
NoneNoneNone
+
+ + +
+

One final important feature of aggregation functions is that they are inconsistent with the rest of SQL in a very useful way. If we add two values, and one of them is null, the result is null. By extension, if we use sum to add all the values in a set, and any of those values are null, the result should also be null. It's much more useful, though, for aggregation functions to ignore null values and only combine those that are non-null. This behavior lets us write our queries as:

+
+ + +
+
%%sqlite survey.db
+select min(dated) from Visited;
+
+ +
+

+
+
1927-02-08
+
+ + +
+

instead of always having to filter explicitly:

+
+ + +
+
%%sqlite survey.db
+select min(dated) from Visited where dated is not null;
+
+ +
+

+
+
1927-02-08
+
+ + +
+

Aggregating all records at once doesn't always make sense. For example, suppose Gina suspects that there is a systematic bias in her data, and that some scientists' radiation readings are higher than others. We know that this doesn't work:

+
+ + +
+
%%sqlite survey.db
+select person, count(reading), round(avg(reading), 2)
+from  Survey
+where quant='rad';
+
+ +
+

+
+
roe86.56
+
+ + +
+

because the database manager selects a single arbitrary scientist's name rather than aggregating separately for each scientist. Since there are only five scientists, she could write five queries of the form:

+
+ + +
+
%%sqlite survey.db
+select person, count(reading), round(avg(reading), 2)
+from  Survey
+where quant='rad'
+and   person='dyer';
+
+ +
+

+
+
dyer28.81
+
+ + +
+

but this would be tedious, and if she ever had a data set with fifty or five hundred scientists, the chances of her getting all of those queries right are small.

+

What we need to do is tell the database manager to aggregate the readings for each scientist separately using a group by clause:

+
+ + +
+
%%sqlite survey.db
+select   person, count(reading), round(avg(reading), 2)
+from     Survey
+where    quant='rad'
+group by person;
+
+ +
+

+
+
+
+
+
dyer28.81
lake21.82
pb36.66
roe111.25
+
+ + +
+

group by does exactly what its name implies: groups all the records with the same value for the specified field together so that aggregation can process each batch separately. Since all the records in each batch have the same value for person, it no longer matters that the database manager is picking an arbitrary one to display alongside the aggregated reading values.

+
+ + +
+

Just as we can sort by multiple criteria at once, we can also group by multiple criteria. To get the average reading by scientist and quantity measured, for example, we just add another field to the group by clause:

+
+ + +
+
%%sqlite survey.db
+select   person, quant, count(reading), round(avg(reading), 2)
+from     Survey
+group by person, quant;
+
+ +
+

+
+
+
+
+
+
+
+
+
+
+
+
Nonesal10.06
Nonetemp1-26.0
dyerrad28.81
dyersal20.11
lakerad21.82
lakesal40.11
laketemp1-16.0
pbrad36.66
pbtemp2-20.0
roerad111.25
roesal232.05
+
+ + +
+

Note that we have added quant to the list of fields displayed, since the results wouldn't make much sense otherwise.

+

Let's go one step further and remove all the entries where we don't know who took the measurement:

+
+ + +
+
%%sqlite survey.db
+select   person, quant, count(reading), round(avg(reading), 2)
+from     Survey
+where    person is not null
+group by person, quant
+order by person, quant;
+
+ +
+

+
+
+
+
+
+
+
+
+
+
dyerrad28.81
dyersal20.11
lakerad21.82
lakesal40.11
laketemp1-16.0
pbrad36.66
pbtemp2-20.0
roerad111.25
roesal232.05
+
+ + +
+

Looking more closely, this query:

+
    +
  1. selected records from the Survey table where the person field was not null;

  2. +
  3. grouped those records into subsets so that the person and quant values in each subset were the same;

  4. +
  5. ordered those subsets first by person, and then within each sub-group by quant; and

  6. +
  7. counted the number of records in each subset, calculated the average reading in each, and chose a person and quant value from each (it doesn't matter which ones, since they're all equal).

  8. +
+
+ + +
+

Challenges

+
    +
  1. How many temperature readings did Frank Pabodie record, and what was their average value?

  2. +
  3. The average of a set of values is the sum of the values divided by the number of values. Does this mean that the avg function returns 2.0 or 3.0 when given the values 1.0, null, and 5.0?

  4. +
  5. We want to calculate the difference between each individual radiation reading and the average of all the radiation readings. We write the query:

    +
    select reading - avg(reading) from Survey where quant='rad';
    +

    What does this actually produce, and why?

  6. +
  7. The function group_concat(field, separator) concatenates all the values in a field using the specified separator character (or ',' if the separator isn't specified). Use this to produce a one-line list of scientists' names, such as:

    +
    William Dyer, Frank Pabodie, Anderson Lake, Valentina Roerich, Frank Danforth
    +

    Can you find a way to order the list by surname?

  8. +
+
+ + +
+

Key Points

+
    +
  • An aggregation function combines many values to produce a single new value.
  • +
  • Aggregation functions ignore null values.
  • +
  • Aggregation happens after filtering.
  • +
+
diff --git a/cached/novice/sql/07-join.md b/cached/novice/sql/07-join.md new file mode 100644 index 000000000..5a2e0793d --- /dev/null +++ b/cached/novice/sql/07-join.md @@ -0,0 +1,288 @@ +--- +layout: lesson +root: ../.. +--- + +## Combining Data + + +
+

Objectives

+
    +
  • Explain the operation of a query that joins two tables.
  • +
  • Explain how to restrict the output of a query containing a join to only include meaningful combinations of values.
  • +
  • Write queries that join tables on equal keys.
  • +
  • Explain what primary and foreign keys are, and why they are useful.
  • +
  • Explain what atomic values are, and why database fields should only contain atomic values.
  • +
+
+ + +
+

In order to submit her data to a web site that aggregates historical meteorological data, Gina needs to format it as latitude, longitude, date, quantity, and reading. However, her latitudes and longitudes are in the Site table, while the dates of measurements are in the Visited table and the readings themselves are in the Survey table. She needs to combine these tables somehow.

+

The SQL command to do this is join. To see how it works, let's start by joining the Site and Visited tables:

+
+ + +
+
%load_ext sqlitemagic
+
+ + +
+
%%sqlite survey.db
+select * from Site join Visited;
+
+ +
+

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
DR-1-49.85-128.57619DR-11927-02-08
DR-1-49.85-128.57622DR-11927-02-10
DR-1-49.85-128.57734DR-31939-01-07
DR-1-49.85-128.57735DR-31930-01-12
DR-1-49.85-128.57751DR-31930-02-26
DR-1-49.85-128.57752DR-3None
DR-1-49.85-128.57837MSK-41932-01-14
DR-1-49.85-128.57844DR-11932-03-22
DR-3-47.15-126.72619DR-11927-02-08
DR-3-47.15-126.72622DR-11927-02-10
DR-3-47.15-126.72734DR-31939-01-07
DR-3-47.15-126.72735DR-31930-01-12
DR-3-47.15-126.72751DR-31930-02-26
DR-3-47.15-126.72752DR-3None
DR-3-47.15-126.72837MSK-41932-01-14
DR-3-47.15-126.72844DR-11932-03-22
MSK-4-48.87-123.4619DR-11927-02-08
MSK-4-48.87-123.4622DR-11927-02-10
MSK-4-48.87-123.4734DR-31939-01-07
MSK-4-48.87-123.4735DR-31930-01-12
MSK-4-48.87-123.4751DR-31930-02-26
MSK-4-48.87-123.4752DR-3None
MSK-4-48.87-123.4837MSK-41932-01-14
MSK-4-48.87-123.4844DR-11932-03-22
+
+ + +
+

join creates the cross product of two tables, i.e., it joins each record of one with each record of the other to give all possible combinations. Since there are three records in Site and eight in Visited, the join's output has 24 records. And since each table has three fields, the output has six fields.

+

What the join hasn't done is figure out if the records being joined have anything to do with each other. It has no way of knowing whether they do or not until we tell it how. To do that, we add a clause specifying that we're only interested in combinations that have the same site name:

+
+ + +
+
%%sqlite survey.db
+select * from Site join Visited on Site.name=Visited.site;
+
+ +
+

+
+
+
+
+
+
+
+
+
DR-1-49.85-128.57619DR-11927-02-08
DR-1-49.85-128.57622DR-11927-02-10
DR-1-49.85-128.57844DR-11932-03-22
DR-3-47.15-126.72734DR-31939-01-07
DR-3-47.15-126.72735DR-31930-01-12
DR-3-47.15-126.72751DR-31930-02-26
DR-3-47.15-126.72752DR-3None
MSK-4-48.87-123.4837MSK-41932-01-14
+
+ + +
+

on does the same job as where: it only keeps records that pass some test. (The difference between the two is that on filters records as they're being created, while where waits until the join is done and then does the filtering.) Once we add this to our query, the database manager throws away records that combined information about two different sites, leaving us with just the ones we want.

+

Notice that we used table.field to specify field names in the output of the join. We do this because tables can have fields with the same name, and we need to be specific about which ones we're talking about. For example, if we joined the Person and Visited tables, the result would inherit a field called ident from each of the original tables.

+

We can now use the same dotted notation to select the three columns we actually want out of our join:

+
+ + +
+
%%sqlite survey.db
+select Site.lat, Site.long, Visited.dated
+from   Site join Visited
+on     Site.name=Visited.site;
+
+ +
+

+
+
+
+
+
+
+
+
+
-49.85-128.571927-02-08
-49.85-128.571927-02-10
-49.85-128.571932-03-22
-47.15-126.72None
-47.15-126.721930-01-12
-47.15-126.721930-02-26
-47.15-126.721939-01-07
-48.87-123.41932-01-14
+
+ + +
+

If joining two tables is good, joining many tables must be better. In fact, we can join any number of tables simply by adding more join clauses to our query, and more on tests to filter out combinations of records that don't make sense:

+
+ + +
+
%%sqlite survey.db
+select Site.lat, Site.long, Visited.dated, Survey.quant, Survey.reading
+from   Site join Visited join Survey
+on     Site.name=Visited.site
+and    Visited.ident=Survey.taken
+and    Visited.dated is not null;
+
+ +
+

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-49.85-128.571927-02-08rad9.82
-49.85-128.571927-02-08sal0.13
-49.85-128.571927-02-10rad7.8
-49.85-128.571927-02-10sal0.09
-47.15-126.721939-01-07rad8.41
-47.15-126.721939-01-07sal0.05
-47.15-126.721939-01-07temp-21.5
-47.15-126.721930-01-12rad7.22
-47.15-126.721930-01-12sal0.06
-47.15-126.721930-01-12temp-26.0
-47.15-126.721930-02-26rad4.35
-47.15-126.721930-02-26sal0.1
-47.15-126.721930-02-26temp-18.5
-48.87-123.41932-01-14rad1.46
-48.87-123.41932-01-14sal0.21
-48.87-123.41932-01-14sal22.5
-49.85-128.571932-03-22rad11.25
+
+ + +
+

We can tell which records from Site, Visited, and Survey correspond with each other because those tables contain primary keys and foreign keys. A primary key is a value, or combination of values, that uniquely identifies each record in a table. A foreign key is a value (or combination of values) from one table that identifies a unique record in another table. Another way of saying this is that a foreign key is the primary key of one table that appears in some other table. In our database, Person.ident is the primary key in the Person table, while Survey.person is a foreign key relating the Survey table's entries to entries in Person.

+

Most database designers believe that every table should have a well-defined primary key. They also believe that this key should be separate from the data itself, so that if we ever need to change the data, we only need to make one change in one place. One easy way to do this is to create an arbitrary, unique ID for each record as we add it to the database. This is actually very common: those IDs have names like "student numbers" and "patient numbers", and they almost always turn out to have originally been a unique record identifier in some database system or other. As the query below demonstrates, SQLite automatically numbers records as they're added to tables, and we can use those record numbers in queries:

+
+ + +
+
%%sqlite survey.db
+select rowid, * from Person;
+
+ +
+

+
+
+
+
+
+
1dyerWilliamDyer
2pbFrankPabodie
3lakeAndersonLake
4roeValentinaRoerich
5danforthFrankDanforth
+
+ +### Data Hygiene + + +
+

Now that we have seen how joins work, we can see why the relational model is so useful and how best to use it. The first rule is that every value should be atomic, i.e., not contain parts that we might want to work with separately. We store personal and family names in separate columns instead of putting the entire name in one column so that we don't have to use substring operations to get the name's components. More importantly, we store the two parts of the name separately because splitting on spaces is unreliable: just think of a name like "Eloise St. Cyr" or "Jan Mikkel Steubart".

+

The second rule is that every record should have a unique primary key. This can be a serial number that has no intrinsic meaning, one of the values in the record (like the ident field in the Person table), or even a combination of values: the triple (taken, person, quant) from the Survey table uniquely identifies every measurement.

+

The third rule is that there should be no redundant information. For example, we could get rid of the Site table and rewrite the Visited table like this:

+ + + + + + + + + +
+619 + +-49.85 + +-128.57 + +1927-02-08 +
+622 + +-49.85 + +-128.57 + +1927-02-10 +
+734 + +-47.15 + +-126.72 + +1939-01-07 +
+735 + +-47.15 + +-126.72 + +1930-01-12 +
+751 + +-47.15 + +-126.72 + +1930-02-26 +
+752 + +-47.15 + +-126.72 + +null +
+837 + +-48.87 + +-123.40 + +1932-01-14 +
+844 + +-49.85 + +-128.57 + +1932-03-22 +
+ +

In fact, we could use a single table that recorded all the information about each reading in each row, just as a spreadsheet would. The problem is that it's very hard to keep data that is organized this way consistent: if we realize that the date of a particular visit to a particular site is wrong, we have to change multiple records in the database. What's worse, we may have to guess which records to change, since other sites may also have been visited on that date.

+

The fourth rule is that the units for every value should be stored explicitly. Our database doesn't do this, and that's a problem: Roerich's salinity measurements are several orders of magnitude larger than anyone else's, but we don't know if that means she was using parts per million instead of parts per thousand, or whether there actually was a saline anomaly at that site in 1932.

+

Stepping back, data and the tools used to store it have a symbiotic relationship: we use tables and joins because it's efficient, provided our data is organized a certain way, but organize our data that way because we have tools to manipulate it efficiently if it's in a certain form. As anthropologists say, the tool shapes the hand that shapes the tool.

+
+ + +
+

Challenges

+
    +
  1. Write a query that lists all radiation readings from the DR-1 site.

  2. +
  3. Write a query that lists all sites visited by people named "Frank".

  4. +
  5. Describe in your own words what the following query produces:

    +
    select Site.name from Site join Visited
    +on Site.lat<-49.0 and Site.name=Visited.site and Visited.dated>='1932-00-00';
  6. +
+
+ + +
+

Key Points

+
    +
  • Every fact should be represented in a database exactly once.
  • +
  • A join produces all combinations of records from one table with records from another.
  • +
  • A primary key is a field (or set of fields) whose values uniquely identify the records in a table.
  • +
  • A foreign key is a field (or set of fields) in one table whose values are a primary key in another table.
  • +
  • We can eliminate meaningless combinations of records by matching primary keys and foreign keys between tables.
  • +
  • Keys should be atomic values to make joins simpler and more efficient.
  • +
+
diff --git a/cached/novice/sql/08-create.md b/cached/novice/sql/08-create.md new file mode 100644 index 000000000..d88aed706 --- /dev/null +++ b/cached/novice/sql/08-create.md @@ -0,0 +1,102 @@ +--- +layout: lesson +root: ../.. +--- + +## Creating and Modifying Data + + +
+

Objectives

+
    +
  • Write queries that create tables.
  • +
  • Write queries to insert, modify, and delete records.
  • +
+
+ + +
+

So far we have only looked at how to get information out of a database, both because that is more frequent than adding information, and because most other operations only make sense once queries are understood. If we want to create and modify data, we need to know two other pairs of commands.

+

The first pair are create table and drop table. While they are written as two words, they are actually single commands. The first one creates a new table; its arguments are the names and types of the table's columns. For example, the following statements create the four tables in our survey database:

+
create table Person(ident text, personal text, family text);
+create table Site(name text, lat real, long real);
+create table Visited(ident integer, site text, dated text);
+create table Survey(taken integer, person text, quant real, reading real);
+

We can get rid of one of our tables using:

+
drop table Survey;
+

Be very careful when doing this: most databases have some support for undoing changes, but it's better not to have to rely on it.

+

Different database systems support different data types for table columns, but most provide the following:

+ + + + + +
+integer + +a signed integer +
+real + +a floating point number +
+text + +a character string +
+blob + +a "binary large object", such as an image +
+ +

Most databases also support Booleans and date/time values; SQLite uses the integers 0 and 1 for the former, and represents the latter as discussed earlier. An increasing number of databases also support geographic data types, such as latitude and longitude. Keeping track of what particular systems do or do not offer, and what names they give different data types, is an unending portability headache.

+

When we create a table, we can specify several kinds of constraints on its columns. For example, a better definition for the Survey table would be:

+
create table Survey(
+    taken   integer not null, -- where reading taken
+    person  text,             -- may not know who took it
+    quant   real not null,    -- the quantity measured
+    reading real not null,    -- the actual reading
+    primary key(taken, quant),
+    foreign key(taken) references Visited(ident),
+    foreign key(person) references Person(ident)
+);
+

Once again, exactly what constraints are available and what they're called depends on which database manager we are using.

+

Once tables have been created, we can add and remove records using our other pair of commands, insert and delete. The simplest form of insert statement lists values in order:

+
insert into Site values('DR-1', -49.85, -128.57);
+insert into Site values('DR-3', -47.15, -126.72);
+insert into Site values('MSK-4', -48.87, -123.40);
+

We can also insert values into one table directly from another:

+
create table JustLatLong(lat text, long text);
+insert into JustLatLong select lat, long from site;
+

Deleting records can be a bit trickier, because we have to ensure that the database remains internally consistent. If all we care about is a single table, we can use the delete command with a where clause that matches the records we want to discard. For example, once we realize that Frank Danforth didn't take any measurements, we can remove him from the Person table like this:

+
delete from Person where ident = 'danforth';
+

But what if we removed Anderson Lake instead? Our Survey table would still contain seven records of measurements he'd taken, but that's never supposed to happen: Survey.person is a foreign key into the Person table, and all our queries assume there will be a row in the latter matching every value in the former.

+

This problem is called referential integrity: we need to ensure that all references between tables can always be resolved correctly. One way to do this is to delete all the records that use 'lake' as a foreign key before deleting the record that uses it as a primary key. If our database manager supports it, we can automate this using cascading delete. However, this technique is outside the scope of this chapter.

+
+

Many applications use a hybrid storage model instead of putting everything into a database: the actual data (such as astronomical images) is stored in files, while the database stores the files' names, their modification dates, the region of the sky they cover, their spectral characteristics, and so on. This is also how most music player software is built: the database inside the application keeps track of the MP3 files, but the files themselves live on disk.

+
+
+ + +
+

Challenges

+
    +
  1. Write an SQL statement to replace all uses of null in Survey.person with the string 'unknown'.

  2. +
  3. One of our colleagues has sent us a CSV file containing temperature readings by Robert Olmstead, which is formatted like this:

    +
    Taken,Temp
    +619,-21.5
    +622,-15.5
    +

    Write a small Python program that reads this file in and prints out the SQL insert statements needed to add these records to the survey database. Note: you will need to add an entry for Olmstead to the Person table. If you are testing your program repeatedly, you may want to investigate SQL's insert or replace command.

  4. +
  5. SQLite has several administrative commands that aren't part of the SQL standard. One of them is .dump, which prints the SQL commands needed to re-create the database. Another is .load, which reads a file created by .dump and restores the database. A colleague of yours thinks that storing dump files (which are text) in version control is a good way to track and manage changes to the database. What are the pros and cons of this approach? (Hint: records aren't stored in any particular order.)

  6. +
+
+ + +
+

Key Points

+
    +
  • Database tables are created using queries that specify their names and the names and properties of their fields.
  • +
  • Records can be inserted, updated, or deleted using queries.
  • +
  • It is simpler and safer to modify data when every record has a unique primary key.
  • +
+
diff --git a/cached/novice/sql/09-prog.md b/cached/novice/sql/09-prog.md new file mode 100644 index 000000000..3155625a8 --- /dev/null +++ b/cached/novice/sql/09-prog.md @@ -0,0 +1,136 @@ +--- +layout: lesson +root: ../.. +--- + +## Programming with Databases + + +
+

Objectives

+
    +
  • Write short programs that execute SQL queries.
  • +
  • Trace the execution of a program that contains an SQL query.
  • +
  • Explain why most database applications are written in a general-purpose language rather than in SQL.
  • +
+
+ + +
+

To close, let's have a look at how to access a database from a general-purpose programming language like Python. Other languages use almost exactly the same model: library and function names may differ, but the concepts are the same.

+

Here's a short Python program that selects latitudes and longitudes from an SQLite database stored in a file called survey.db:

+
+ + +
+
import sqlite3
+connection = sqlite3.connect("survey.db")
+cursor = connection.cursor()
+cursor.execute("select site.lat, site.long from site;")
+results = cursor.fetchall()
+for r in results:
+    print r
+cursor.close()
+connection.close()
+
+ +
+
(-49.85, -128.57)
+(-47.15, -126.72)
+(-48.87, -123.4)
+
+
+ + +
+

The program starts by importing the sqlite3 library. If we were connecting to MySQL, DB2, or some other database, we would import a different library, but all of them provide the same functions, so that the rest of our program does not have to change (at least, not much) if we switch from one database to another.

+

Line 2 establishes a connection to the database. Since we're using SQLite, all we need to specify is the name of the database file. Other systems may require us to provide a username and password as well. Line 3 then uses this connection to create a cursor; just like the cursor in an editor, its role is to keep track of where we are in the database.

+

On line 4, we use that cursor to ask the database to execute a query for us. The query is written in SQL, and passed to cursor.execute as a string. It's our job to make sure that SQL is properly formatted; if it isn't, or if something goes wrong when it is being executed, the database will report an error.

+

The database returns the results of the query to us in response to the cursor.fetchall call on line 5. This result is a list with one entry for each record in the result set; if we loop over that list (line 6) and print those list entries (line 7), we can see that each one is a tuple with one element for each field we asked for.

+

Finally, lines 8 and 9 close our cursor and our connection, since the database can only keep a limited number of these open at one time. Since establishing a connection takes time, though, we shouldn't open a connection, do one operation, then close the connection, only to reopen it a few microseconds later to do another operation. Instead, it's normal to create one connection that stays open for the lifetime of the program.

+
+ + +
+

Queries in real applications will often depend on values provided by users. For example, this function takes a user's ID as a parameter and returns their name:

+
+ + +
+
def get_name(database_file, person_ident):
+    query = "select personal || ' ' || family from Person where ident='" + person_ident + "';"
+
+    connection = sqlite3.connect(database_file)
+    cursor = connection.cursor()
+    cursor.execute(query)
+    results = cursor.fetchall()
+    cursor.close()
+    connection.close()
+
+    return results[0][0]
+
+print "full name for dyer:", get_name('survey.db', 'dyer')
+
+ +
+
full name for dyer: William Dyer
+
+
+ + +
+

We use string concatenation on the first line of this function to construct a query containing the user ID we have been given. This seems simple enough, but what happens if someone gives us this string as input?

+
dyer'; drop table Survey; select '
+

It looks like there's garbage after the name of the project, but it is very carefully chosen garbage. If we insert this string into our query, the result is:

+
select personal || ' ' || family from Person where ident='dyer'; drop table Survey; select '';
+

If we execute this, it will erase one of the tables in our database.

+

This is called an SQL injection attack, and it has been used to attack thousands of programs over the years. In particular, many web sites that take data from users insert values directly into queries without checking them carefully first.

+

Since a villain might try to smuggle commands into our queries in many different ways, the safest way to deal with this threat is to replace characters like quotes with their escaped equivalents, so that we can safely put whatever the user gives us inside a string. We can do this by using a prepared statement instead of formatting our statements as strings. Here's what our example program looks like if we do this:

+
+ + +
+
def get_name(database_file, person_ident):
+    query = "select personal || ' ' || family from Person where ident=?;"
+
+    connection = sqlite3.connect(database_file)
+    cursor = connection.cursor()
+    cursor.execute(query, [person_ident])
+    results = cursor.fetchall()
+    cursor.close()
+    connection.close()
+
+    return results[0][0]
+
+print "full name for dyer:", get_name('survey.db', 'dyer')
+
+ +
+
full name for dyer: William Dyer
+
+
+ + +
+

The key changes are in the query string and the execute call. Instead of formatting the query ourselves, we put question marks in the query template where we want to insert values. When we call execute, we provide a list that contains as many values as there are question marks in the query. The library matches values to question marks in order, and translates any special characters in the values into their escaped equivalents so that they are safe to use.

+
+ + +
+

Challenges

+
    +
  1. Write a Python program that creates a new database in a file called original.db containing a single table called Pressure, with a single field called reading, and inserts 100,000 random numbers between 10.0 and 25.0. How long does it take this program to run? How long does it take to run a program that simply writes those random numbers to a file?

  2. +
  3. Write a Python program that creates a new database called backup.db with the same structure as original.db and copies all the values greater than 20.0 from original.db to backup.db. Which is faster: filtering values in the query, or reading everything into memory and filtering in Python?

  4. +
+
+ + +
+

Key Points

+
    +
  • We usually write database applications in a general-purpose language, and embed SQL queries in it.
  • +
  • To connect to a database, a program must use a library specific to that database manager.
  • +
  • A program may open one or more connections to a single database, and have one or more cursors active in each.
  • +
  • Programs can read query results in batches or all at once.
  • +
+
diff --git a/css/lesson.css b/css/lesson.css index b97b17a25..0b914df84 100644 --- a/css/lesson.css +++ b/css/lesson.css @@ -51,6 +51,7 @@ div.out:before { /* Error output. */ .err { color: darkred; + margin-left: 20px; font-style: italic; font-weight: bold; } diff --git a/gloss.md b/gloss.md index 2ef80e012..c40a1e104 100644 --- a/gloss.md +++ b/gloss.md @@ -7,7 +7,7 @@ title: Glossary A [path](#path) that refers to a particular location in a file system. Absolute paths are usually written with respect to the file system's [root directory](#root-directory), -and begin with either "/" (on Unix) or "\" (on Microsoft Windows). +and begin with either "/" (on Unix) or "\\" (on Microsoft Windows). See also: [relative path](#relative-path). **additive color model**: @@ -381,7 +381,7 @@ A collection of data organized into [tables](#table-database). **relative path**: A [path](#path) that specifies the location of a file or directory with respect to the [current working directory](#current-working-directory). -Any path that does not begin with a separator character ("/" or "\") is a relative path. +Any path that does not begin with a separator character ("/" or "\\") is a relative path. See also: [absolute path](#absolute-path). **remote repository**: @@ -411,7 +411,7 @@ Each color's value is typically in the range 0..255 **root directory**: The top-most directory in a [filesystem](#filesystem). -Its name is "/" on Unix (including Linux and Mac OS X) and "\" on Microsoft Windows. +Its name is "/" on Unix (including Linux and Mac OS X) and "\\" on Microsoft Windows. **sentinel value**: A value in a collection that has a special meaning, diff --git a/novice/extras/01-branching.md b/novice/extras/01-branching.md index 4cdc7dbd9..c1b99114d 100644 --- a/novice/extras/01-branching.md +++ b/novice/extras/01-branching.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. 
title: Branching in Git -level: novice --- Here's where we are right now: diff --git a/novice/extras/02-review.md b/novice/extras/02-review.md index 1900d6c1e..791c3d988 100644 --- a/novice/extras/02-review.md +++ b/novice/extras/02-review.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Code Review -level: novice --- The model shown in the previous section, in which everyone pushes and pulls from a single repository, diff --git a/novice/extras/03-permissions.md b/novice/extras/03-permissions.md index 99af175ef..88e99b75d 100644 --- a/novice/extras/03-permissions.md +++ b/novice/extras/03-permissions.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Permissions -level: novice --- It's now time to look at how Unix determines who can see the contents of which files, who can *change* those files, diff --git a/novice/extras/04-shellvar.md b/novice/extras/04-shellvar.md index ed1cfefba..ec17c3d39 100644 --- a/novice/extras/04-shellvar.md +++ b/novice/extras/04-shellvar.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Shell Variables -level: novice --- The shell is just a program, and like other programs, it has variables. Those variables control its execution, diff --git a/novice/extras/05-ssh.md b/novice/extras/05-ssh.md index 3f84aea80..d50764937 100644 --- a/novice/extras/05-ssh.md +++ b/novice/extras/05-ssh.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Working Remotely -level: novice --- Let's take a closer look at what happens when we use a desktop or laptop computer. The first step is to log in diff --git a/novice/extras/06-alias.md b/novice/extras/06-alias.md index 9c59de557..6763fa8f5 100644 --- a/novice/extras/06-alias.md +++ b/novice/extras/06-alias.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. 
title: Aliasing -level: novice --- At this point, we need to take a small side trip to explore something which is very useful, diff --git a/novice/extras/07-exceptions.md b/novice/extras/07-exceptions.md index f296dbf54..0b13d0afc 100644 --- a/novice/extras/07-exceptions.md +++ b/novice/extras/07-exceptions.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Exceptions -level: novice --- Assertions help us catch errors in our code, but things can go wrong for other reasons, @@ -75,7 +74,7 @@ try: reader = open('nonexistent-file.txt', 'r') except IOError: print 'Whoops!' -Whoops! +Whoops! ~~~ When Python executes this code, diff --git a/novice/extras/08-unit.md b/novice/extras/08-unit.md index fd719d450..399c6ad9e 100644 --- a/novice/extras/08-unit.md +++ b/novice/extras/08-unit.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Unit Testing -level: novice --- Most people don't enjoy writing tests, so if we want them to actually do it, diff --git a/novice/extras/09-debugger.md b/novice/extras/09-debugger.md index ce01e0bc8..43181bc8e 100644 --- a/novice/extras/09-debugger.md +++ b/novice/extras/09-debugger.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Using a Debugger -level: novice --- Programmers spend a lot of time debugging, so it's worth learning how to do it systematically. diff --git a/novice/extras/10-numbers.md b/novice/extras/10-numbers.md index d28fec100..7f4b4ac04 100644 --- a/novice/extras/10-numbers.md +++ b/novice/extras/10-numbers.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Numbers -level: novice --- Let's start by looking at how numbers are stored. If we only have the two digits 0 and 1, diff --git a/novice/extras/11-human.md b/novice/extras/11-human.md index a3b5c8ed4..f015f5b9b 100644 --- a/novice/extras/11-human.md +++ b/novice/extras/11-human.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. 
title: The Human Side of Things -level: novice --- In my experience, the things that go wrong most often in software development projects—undergraduate or professional—have nothing to diff --git a/novice/extras/12-why.md b/novice/extras/12-why.md index 94e4edfad..5260f7df4 100644 --- a/novice/extras/12-why.md +++ b/novice/extras/12-why.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Why I Teach -level: novice --- When I was your age, I thought universities existed to teach people how to learn. Later, when I was in grad school, I thought diff --git a/novice/extras/fixme-man.md b/novice/extras/fixme-man.md index eb466e5f3..5de1e9c7d 100644 --- a/novice/extras/fixme-man.md +++ b/novice/extras/fixme-man.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Manual Pages -level: novice --- You can get help for any Unix command with the `man` (short for manual) command. As an example here is the command to lookup information on 'cp' diff --git a/novice/extras/index.md b/novice/extras/index.md index cccb6d3d6..2de8e702a 100644 --- a/novice/extras/index.md +++ b/novice/extras/index.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: A Few Extras -level: novice --- A few things come up in our classes that don't fit naturally into the flow of our lessons. diff --git a/novice/git/00-intro.md b/novice/git/00-intro.md index a102ee290..fb311e307 100644 --- a/novice/git/00-intro.md +++ b/novice/git/00-intro.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Introducing Version Control -level: novice --- Wolfman and Dracula have been hired by Universal Missions (a space services spinoff from Euphoric State University) diff --git a/novice/git/01-backup.md b/novice/git/01-backup.md index e97b093fa..c067cdd23 100644 --- a/novice/git/01-backup.md +++ b/novice/git/01-backup.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: A Better Kind of Backup -level: novice ---
diff --git a/novice/git/02-collab.md b/novice/git/02-collab.md index 5db837f75..3c7d7d57c 100644 --- a/novice/git/02-collab.md +++ b/novice/git/02-collab.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Collaborating -level: novice ---
diff --git a/novice/git/03-conflict.md b/novice/git/03-conflict.md index 2de353a86..50aabbef1 100644 --- a/novice/git/03-conflict.md +++ b/novice/git/03-conflict.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Conflicts -level: novice ---
diff --git a/novice/git/04-open.md b/novice/git/04-open.md index 2438b20dc..f35c2e0ca 100644 --- a/novice/git/04-open.md +++ b/novice/git/04-open.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Open Science -level: novice ---
diff --git a/novice/git/index.md b/novice/git/index.md index fa127494a..6aa2b1370 100644 --- a/novice/git/index.md +++ b/novice/git/index.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Version Control with Git -level: novice --- Version control is the lab notebook of the digital world: it's what professionals use to keep track of what they've done diff --git a/novice/python/index.md b/novice/python/index.md index 26a36acd3..6ef6e5baa 100644 --- a/novice/python/index.md +++ b/novice/python/index.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Programming with Python -level: novice --- The best way to learn how to program is to do something useful, so this introduction to Python is built around a common scientific task: diff --git a/novice/r/README.md b/novice/r/README.md index 1c82f8d82..2c5e51df0 100644 --- a/novice/r/README.md +++ b/novice/r/README.md @@ -2,6 +2,5 @@ layout: lesson root: ../.. title: Programming with R -level: novice --- FIXME: to be written. diff --git a/novice/ref/01-shell.md b/novice/ref/01-shell.md index 2874d3d46..0aa49052a 100644 --- a/novice/ref/01-shell.md +++ b/novice/ref/01-shell.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Shell Reference -level: novice --- #### Basic Commands diff --git a/novice/ref/02-git.md b/novice/ref/02-git.md index d2d835d3b..c1367dfb2 100644 --- a/novice/ref/02-git.md +++ b/novice/ref/02-git.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Git Reference -level: novice --- Set global configuration (only needs to be done once per machine): diff --git a/novice/ref/03-python.md b/novice/ref/03-python.md index 57063e8cc..ec973a775 100644 --- a/novice/ref/03-python.md +++ b/novice/ref/03-python.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Python Reference -level: novice --- #### Basic Operations diff --git a/novice/ref/04-sql.md b/novice/ref/04-sql.md index eeaa3a605..304027f07 100644 --- a/novice/ref/04-sql.md +++ b/novice/ref/04-sql.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. 
title: SQL Reference -level: novice --- #### Basic Queries diff --git a/novice/ref/index.md b/novice/ref/index.md index d0cd7eef5..0a20aaa50 100644 --- a/novice/ref/index.md +++ b/novice/ref/index.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Reference -level: novice --- These short reference guides cover the basic tools and ideas introduced in our lessons. diff --git a/novice/shell/00-intro.md b/novice/shell/00-intro.md index 33842e716..4ec1c7175 100644 --- a/novice/shell/00-intro.md +++ b/novice/shell/00-intro.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Introducing the Shell -level: novice ---
diff --git a/novice/shell/01-filedir.md b/novice/shell/01-filedir.md index 7655ab3bd..c35bab095 100644 --- a/novice/shell/01-filedir.md +++ b/novice/shell/01-filedir.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Files and Directories -level: novice ---
diff --git a/novice/shell/02-create.md b/novice/shell/02-create.md index 8bd67bc84..814c55961 100644 --- a/novice/shell/02-create.md +++ b/novice/shell/02-create.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Creating Things -level: novice ---
diff --git a/novice/shell/03-pipefilter.md b/novice/shell/03-pipefilter.md index 25782cb84..68017f814 100644 --- a/novice/shell/03-pipefilter.md +++ b/novice/shell/03-pipefilter.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Pipes and Filters -level: novice ---
diff --git a/novice/shell/04-loop.md b/novice/shell/04-loop.md index 42c4cc495..3bf3f2f2b 100644 --- a/novice/shell/04-loop.md +++ b/novice/shell/04-loop.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Loops -level: novice ---
diff --git a/novice/shell/05-script.md b/novice/shell/05-script.md index 43d6eb283..785f12c3f 100644 --- a/novice/shell/05-script.md +++ b/novice/shell/05-script.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Shell Scripts -level: novice ---
diff --git a/novice/shell/06-find.md b/novice/shell/06-find.md index 2b6d0bb31..9e07a7605 100644 --- a/novice/shell/06-find.md +++ b/novice/shell/06-find.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Finding Things -level: novice ---
diff --git a/novice/shell/index.md b/novice/shell/index.md index befd5edf4..1479c1942 100644 --- a/novice/shell/index.md +++ b/novice/shell/index.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: The Unix Shell -level: novice --- The Unix shell has been around longer than most of its users have been alive. It has survived so long because it's a power tool diff --git a/novice/sql/index.md b/novice/sql/index.md index aded47f2c..a4bcf12ed 100644 --- a/novice/sql/index.md +++ b/novice/sql/index.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Using Databases and SQL -level: novice --- Almost everyone has used spreadsheets, and almost everyone has eventually run up against their limitations. diff --git a/novice/teaching/01-shell.md b/novice/teaching/01-shell.md index b74592398..031a8f0b9 100644 --- a/novice/teaching/01-shell.md +++ b/novice/teaching/01-shell.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: The Unix Shell -level: novice --- Many people have questioned whether we should still teach the shell. After all, diff --git a/novice/teaching/02-git.md b/novice/teaching/02-git.md index ac2b314eb..6dd2fd722 100644 --- a/novice/teaching/02-git.md +++ b/novice/teaching/02-git.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Version Control with Git -level: novice --- Version control might be the most important topic we teach, but Git is definitely the most complicated tool. diff --git a/novice/teaching/03-python.md b/novice/teaching/03-python.md index e26cfbcad..a119f06f7 100644 --- a/novice/teaching/03-python.md +++ b/novice/teaching/03-python.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. 
title: Programming with Python -level: novice --- This lesson is written as an introduction to Python, but its real purpose is to introduce the single most important idea in programming: diff --git a/novice/teaching/04-sql.md b/novice/teaching/04-sql.md index 6fdffb404..fa2878ee8 100644 --- a/novice/teaching/04-sql.md +++ b/novice/teaching/04-sql.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Using Databases and SQL -level: novice --- Relational databases are not as widely used in science as in business, but they are still a common way to store large data sets with complex structure. diff --git a/novice/teaching/index.md b/novice/teaching/index.md index 7e3c2127d..8f675d04f 100644 --- a/novice/teaching/index.md +++ b/novice/teaching/index.md @@ -2,7 +2,6 @@ layout: lesson root: ../.. title: Instructor's Guide -level: novice --- > *1997*