#!/bin/bash # A simple script to backup an organization's GitHub repositories. # Initially from https://gist.github.com/rodw/3073987 snapshot at # https://gist.githubusercontent.com/rodw/3073987/raw/d5e9ab4785647e558df488eb18623aa6c52af86b/backup-github.sh # Continued afterwards 2023+ by Jim Klimov <gmail.com> #------------------------------------------------------------------------------- # NOTES: #------------------------------------------------------------------------------- # * Under the heading "CONFIG" below you'll find a number of configuration # parameters that must be personalized for your GitHub account and org. # Replace the `<CHANGE-ME>` strings with the value described in the comments # (or overwrite those values at run-time by providing environment variables). # # * Your terminal/screen session used for backups would benefit from having # your SSH key preloaded into the ssh-agent, e.g.: # eval `ssh-agent` # ssh-add ~/.ssh/id_rsa # or otherwise made available to the git client (no passphrase? oh no!) # # * If you have more than 100 repositories, the script should be able to # step thru the list of repos returned by GitHub one page at a time, # beware API limits (server-side throttling); maybe support for HTTP-304 # cache would be beneficial (also to avoid fetches that bring no news?) # # * If you want to back up the repos for a USER rather than an ORGANIZATION, # or a user's gists (and their comments), see GHBU_ORGMODE setting below. # # * Thanks to @rodw for the original script, and to @Calrion, @vnaum, # @BartHaagdorens and other commenters in original gist for various fixes # and updates. # # * Also see those comments (and related revisions and forks) for more # information and general troubleshooting. #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- # CONFIG: #------------------------------------------------------------------------------- GHBU_ORG=${GHBU_ORG-"<CHANGE-ME>"} # the GitHub organization whose repos will be backed up # # (if you're backing up a USER's repos, this should be your GitHub username; also see the note below about the `REPOLIST` definition) GHBU_UNAME=${GHBU_UNAME-"<CHANGE-ME>"} # the username of a GitHub account (to use with the GitHub API) GHBU_PASSWD=${GHBU_PASSWD-"<CHANGE-ME>"} # the password for that account #------------------------------------------------------------------------------- GHBU_ORGMODE=${GHBU_ORGMODE-"org"} # "org", "user" or "gists"? GHBU_BACKUP_DIR=${GHBU_BACKUP_DIR-"github-backups"} # where to place the backup files; avoid using ":" in the name (confuses tar as a hostname; confuses Windows as a drive letter) GHBU_GITHOST=${GHBU_GITHOST-"github.com"} # the GitHub hostname (see comments) GHBU_REUSE_REPOS=${GHBU_REUSE_REPOS-false} # as part of backup process, we mirror-clone remote git repos; should we keep and reuse them for next backups (true), or always snatch from scratch (false)? GHBU_TARBALL_REPOS=${GHBU_TARBALL_REPOS-true} # when `true`, tarballs for each backed-up repository would be made; when false (e.g. if persistent repos due to GHBU_REUSE_REPOS=true suffice), only non-git items would be tarballed (issues, comments, metadata) GHBU_PRUNE_INCOMPLETE=${GHBU_PRUNE_INCOMPLETE-false} # when `true`, backups named like *.__WRITING__ will be deleted when script starts (set `false` if using same GHBU_BACKUP_DIR for several scripts running in parallel) GHBU_PRUNE_PREV=${GHBU_PRUNE_PREV-false} # when `true`, only the "*.latest.tar.gz" backups will be tracked; when `false` also the `*.prev.tar.gz" GHBU_PRUNE_OLD=${GHBU_PRUNE_OLD-true} # when `true`, old backups will be deleted GHBU_PRUNE_AFTER_N_DAYS=${GHBU_PRUNE_AFTER_N_DAYS-3} # the min age (in days) of backup files to delete GHBU_SILENT=${GHBU_SILENT-false} # when `true`, only show error messages GHBU_API=${GHBU_API-"https://api.github.com"} # base URI for the GitHub API GHBU_GIT_CLONE_CMD="GITCMD clone --quiet --mirror " # base command to use to clone GitHub repos from an URL (may need more info for SSH) GHBU_GIT_CLONE_CMD_SSH="${GHBU_GIT_CLONE_CMD} git@${GHBU_GITHOST}:" # base command to use to clone GitHub repos over SSH GHBU_FAILFAST_GETGIT="${GHBU_FAILFAST_GETGIT-true}" # if true, repeated failure of getgit() will fail the script; if false - go over other repos TSTAMP="`TZ=UTC date "+%Y%m%dT%H%MZ"`" # format of timestamp suffix appended to archived files #------------------------------------------------------------------------------- # (end config) #------------------------------------------------------------------------------- # The function `check` will exit the script if the given command fails. function check { "$@" status=$? if [ $status -ne 0 ]; then echo "ERROR: Encountered error (${status}) while running the following:" >&2 echo " $@" >&2 echo " (at line ${BASH_LINENO[0]} of file $0.)" >&2 echo " Aborting." >&2 exit $status fi } # The function `check_unexpected_success` will exit the script if the given command DOES NOT FAIL. function check_unexpected_success { "$@" status=$? if [ $status -eq 0 ]; then echo "ERROR: Encountered error (unexpected success) while running the following:" >&2 echo " $@" >&2 echo " (at line ${BASH_LINENO[0]} of file $0.)" >&2 echo " Aborting." >&2 exit 1 fi } # The function `tgz` will create a gzipped tar archive of the specified # file ($1) and then optionally remove the original function tgz { $GHBU_TARBALL_REPOS || return 0 check tar zcf "$1.$TSTAMP.tar.gz.__WRITING__" "$1" \ && check mv -f "$1.$TSTAMP.tar.gz.__WRITING__" "$1.$TSTAMP.tar.gz" \ || return if ! $GHBU_REUSE_REPOS ; then check rm -rf "$1" fi # Let one copy (or two if "prev" is used) survive the auto-prune if $GHBU_PRUNE_PREV ; then rm -f "$1.prev.tar.gz" || true rm -f "$1.latest.tar.gz" || true else if [ -e "$1.latest.tar.gz" ] ; then mv -f "$1.latest.tar.gz" "$1.prev.tar.gz" || true fi fi check ln "$1.$TSTAMP.tar.gz" "$1.latest.tar.gz" } # Shortcut for non-repo items (issues, comments, metadata JSONs...) function tgz_nonrepo { GHBU_TARBALL_REPOS=true \ GHBU_REUSE_REPOS=false \ tgz "$@" } # Optionally delete files with a __WRITING__ extension, # likely abandoned due to run-time errors like a reboot, # Ctrl+C, connection loss, disk space... function prune_incomplete { if $GHBU_PRUNE_INCOMPLETE ; then $GHBU_SILENT || (echo "" && echo "=== PRUNING INCOMPLETE LEFTOVERS (if any) ===" && echo "") $GHBU_SILENT || echo "Found `find $GHBU_BACKUP_DIR -maxdepth 1 -name '*.__WRITING__*' | wc -l` files to prune." find $GHBU_BACKUP_DIR -maxdepth 1 -name '*.__WRITING__*' -exec rm -fv {} > /dev/null \; $GHBU_SILENT || (echo "" && echo "=== PRUNING FINISHED ===" && echo "") fi } # The function `getdir` will return the repo directory name on stdout # if successful (it depends on GHBU_REUSE_REPOS value). function getdir { local REPOURI="$1" local DIRNAME REPOURI="$(echo "$REPOURI" | sed 's,^https://gist.github.com/\(.*\)$,gist-\1,')" # Note our caller adds a ".comments" suffix; another may be from API, orig: # https://api.github.com/gists/b92a4fe5bb8eab70e79d6f1581563863/comments REPOURI="$(echo "$REPOURI" | sed 's,^'"$GHBU_API"'/gists/\([^/]*\)/comments\(\.comments\)*$,gist-\1-comments,')" if $GHBU_REUSE_REPOS ; then DIRNAME="${GHBU_BACKUP_DIR}/${GHBU_ORG}-${REPOURI}" fi if ! $GHBU_REUSE_REPOS ; then DIRNAME="${DIRNAME}-${TSTAMP}" fi case "$REPOURI" in *.git) ;; *) DIRNAME="${DIRNAME}.git" ;; esac echo "$DIRNAME" } # See coments below function GITCMD { if [ -z "$CRED_HELPER" ] ; then git "$@" else # Note the deletion of credential.helper first, it is multivalued # Per https://stackoverflow.com/a/70963737 explanation: # > The additional empty value for credential.helper causes any # > existing credential helpers to be removed, preventing the # > addition of this token into the user's credential helper. # > If you'd like the user to be able to save it, then remove # > that directive. # In our case, the first HTTP(S) download would block asking for # user/pass; however a "credential.helper=cache --timeout=360000" # might help subsequent activities (and/or corrupt them, if we use # different credentials for backups and/or interactive development). git -c credential.helper= -c credential.helper="$CRED_HELPER" "$@" fi } # The function `getgit` will clone (or update) specified repo ($1, without # a `.git` suffix) into specified directory ($2) function getgit ( # Sub-shelled to constrain "export" visibility of credentials local REPOURI="$1" local DIRNAME="$2" case x"$1" in xhttp://*|xhttps://*) # Prepare HTTP(S) credential support for this git operation. local CRED_HELPER='!f() { echo "username=$GHBU_UNAME"; echo "password=$GHBU_PASSWD"; }; f' export CRED_HELPER GHBU_UNAME GHBU_PASSWD ;; esac if $GHBU_REUSE_REPOS && [ -d "${DIRNAME}" ] ; then # Update an existing repo (if reusing) $GHBU_SILENT || echo "... Updating $REPOURI clone in $DIRNAME" # Note "fatal: fetch --all does not make sense with refspecs" likely in the mirror: (cd "${DIRNAME}" && { GITCMD fetch --quiet --tags \ && GITCMD fetch --quiet --all \ || GITCMD fetch --quiet }) || return else $GHBU_SILENT || echo "... Cloning $REPOURI into $DIRNAME" case x"$1" in x*@*|x*://*) # URL already; .git suffix should be irrelevant ${GHBU_GIT_CLONE_CMD} "${REPOURI}" "${DIRNAME}" || return ;; *) # Just a repo name - complete it with data we know of ${GHBU_GIT_CLONE_CMD_SSH}"${GHBU_ORG}/${REPOURI}.git" "${DIRNAME}" \ || { # Errors were seen above, so no GHBU_SILENT here: echo "..... Attempt a retry over HTTPS" >&2 # FIXME: Effectively we craft the clone_url # here, rather than using one from metadata ${GHBU_GIT_CLONE_CMD} "https://${GHBU_GITHOST}/${GHBU_ORG}/${REPOURI}" "${DIRNAME}" \ && echo "..... Attempt a retry over HTTPS: SUCCEEDED" >&2 ; \ } || return ;; esac fi # Return a success either way here: $GHBU_SILENT || echo "+++ Received $REPOURI into $DIRNAME" ) function filter_user_org { # Might be better off getting a "clone_url" here, but so far our # directory naming etc. rely on the "REPONAME" value received here: check grep '^ "name"' | check awk -F': "' '{print $2}' | check sed -e 's/",//g' } function filter_gist { check sed -n 's/.*git_pull_url": "\(.*\)",/\1/p' } function filter_gist_comments { check sed -n 's/.*comments_url": "\(.*\)",/\1/p' } function get_multipage_file { # Uses caller envvars FILENAME, APIURL, APIQUERY_SUFFIX, ENTRYID_REGEX # Returns MULTIPAGE_NUM, MULTIPAGE_OK, ENTRY_COUNT # Assumes GNU date or compatible - try to avoid extra traffic for us # and load for servers (and REST API quota hits), by asking GitHub # REST API directly: "If-Modified-Since" we last fetched the document: [ -s "$FILENAME" ] && FILEDATE="`date -R -u -r "$FILENAME" | sed 's,\+0000,GMT,'`" || FILEDATE="" ETAGS="`dirname "$FILENAME"`/etags.cache" [ -s "$ETAGS" ] && FILEETAG="`grep -E "\t$APIURL\$" < "$ETAGS" | awk '{print $1}'`" || FILEETAG="" MULTIPAGE_NUM=1 MULTIPAGE_OK=true ENTRY_COUNT=0 while : ; do # ex. APIURL="${GHBU_API}/repos/${GHBU_ORG}/${REPO}/issues" # ex. APIQUERY_SUFFIX="&state=all" # Also note: https://docs.github.com/en/rest/overview/resources-in-the-rest-api?apiVersion=2022-11-28#user-agent-required # hence `the User-Agent: username` however it causes JSON # markup not prettified for humans (and line-based grep) # Ordering defaults to showing newest issues first # TODO: Can we impact ordering of comments so newest # changed would be first and we know from the first page # that we need to re-fetch this?.. CURLRES=0 rm -f "${FILENAME}.headers" || true WANT_HEADER="" if [ 1 = "$MULTIPAGE_NUM" ] && ( [ -n "$FILEETAG" ] || [ -n "$FILEDATE" ] ) ; then WANT_HEADER="yes" fi curl --silent -u "${GHBU_UNAME}:${GHBU_PASSWD}" \ -H "User-Agent: ${GHBU_UNAME}" \ ${FILEETAG+-H "If-None-Match: ${FILEETAG}"} \ ${FILEDATE+-H "If-Modified-Since: ${FILEDATE}"} \ ${WANT_HEADER+-D "${FILENAME}.headers"} \ "${APIURL}?per_page=100&page=${MULTIPAGE_NUM}${APIQUERY_SUFFIX}" -q \ > "${FILENAME}.__WRITING__.tmp" \ || { echo "FAILED to fetch '${APIURL}' once: will sleep in case it is about the usage quota and try again" sleep 120 check curl --silent -u "${GHBU_UNAME}:${GHBU_PASSWD}" \ -H "User-Agent: ${GHBU_UNAME}" \ ${FILEETAG+-H "If-None-Match: ${FILEETAG}"} \ ${FILEDATE+-H "If-Modified-Since: ${FILEDATE}"} \ ${WANT_HEADER+-D "${FILENAME}.headers"} \ "${APIURL}?per_page=100&page=${MULTIPAGE_NUM}${APIQUERY_SUFFIX}" -q \ > "${FILENAME}.__WRITING__.tmp" } || CURLRES=$? # NOTE: Value may include quotes; header is posted with them! NEWETAG="`grep -i '^etag:' "${FILENAME}.headers" | sed 's,^[Ee][Tt][Aa][Gg]: *,,' | tr -d '\r' | tr -d '\n'`" || NEWETAG="" if [ -n "$NEWETAG" ] && [ x"$NEWETAG" != x"$FILEETAG" ] ; then printf '%s\t%s\n' "$NEWETAG" "$APIURL" > "$ETAGS.tmp" if [ -s "$ETAGS" ]; then grep -v "$APIURL" "$ETAGS" >> "$ETAGS.tmp" ; fi mv "$ETAGS.tmp" "$ETAGS" fi if head -1 "${FILENAME}.headers" | grep -E "HTTP.*304" > /dev/null ; then # Let HTTP-304 skips be seen $GHBU_SILENT || echo "SKIP: (First page of) the requested resource did not change" rm -f "${FILENAME}.headers" "${FILENAME}.__WRITING__.tmp" MULTIPAGE_OK=true ENTRY_COUNT="`grep -Ec "$ENTRYID_REGEX" < "${FILENAME}"`" break fi # Only ask for first page unset FILEDATE unset FILEETAG if [ $CURLRES != 0 ] ; then MULTIPAGE_OK=false break fi if [ 1 = "$MULTIPAGE_NUM" ] ; then # Not cached => replace rm -f "$FILENAME" || true fi # Produce pretty JSON we can grep in, and otherwise manipulate below: jq < "${FILENAME}.__WRITING__.tmp" > "${FILENAME}.__WRITING__" rm -f "${FILENAME}.__WRITING__.tmp" if [ 1024 > "`wc -c < "${FILENAME}.__WRITING__"`" ] \ && grep -q '"message": "API rate limit exceeded' "${FILENAME}.__WRITING__" \ ; then echo "FAILED to fetch '${APIURL}': got some contents but they are short and only say that API rate limit exceeded" MULTIPAGE_OK=false break fi # ex. ENTRYID_REGEX='"url": "'"${GHBU_API}/repos/${GHBU_ORG}/${REPO}/issues/[0123456789]+"'"' NUM="`grep -Ec "$ENTRYID_REGEX" < "${FILENAME}.__WRITING__"`" ENTRY_COUNT="`expr $ENTRY_COUNT + $NUM`" if [ ! -e "${FILENAME}" ] ; then check mv -f "${FILENAME}.__WRITING__" "${FILENAME}" else (head -n -2 "${FILENAME}" && \ echo " }," && \ tail -n +2 "${FILENAME}.__WRITING__" ) > "${FILENAME}.__WRITING__.tmp" \ || { MULTIPAGE_OK=false; break; } mv -f "${FILENAME}.__WRITING__.tmp" "${FILENAME}" rm -f "${FILENAME}.__WRITING__" fi if [ $NUM -lt 100 ] ; then # Last page break fi MULTIPAGE_NUM="`expr $MULTIPAGE_NUM + 1`" done rm -f "${FILENAME}.headers" # Return status $MULTIPAGE_OK } check command -v jq $GHBU_SILENT || (echo "" && echo "=== INITIALIZING ===" && echo "") $GHBU_SILENT || echo "Using backup directory $GHBU_BACKUP_DIR" check mkdir -p $GHBU_BACKUP_DIR prune_incomplete $GHBU_SILENT || echo -n "Fetching list of repositories for ${GHBU_ORG}..." case x"$GHBU_ORGMODE" in x"gist"|x"gists") GHBU_ORG_URI="/gists" ;; x"user"|x"users") # NOTE: if you're backing up a *user's* repos, not an organizations, use this instead: if [ "${GHBU_ORG}" = "${GHBU_UNAME}" ] ; then # Backing up ourselves GHBU_ORG_URI="/user" else # Backing up a friend/alter-ego (using our login) GHBU_ORG_URI="/users/${GHBU_ORG}" fi ;; x"org"|*) # Legacy default GHBU_ORG_URI="/orgs/${GHBU_ORG}" ;; esac # Be sure to stash a fresh copy, even if previous backup was interrupted rm -f "${GHBU_BACKUP_DIR}/${GHBU_ORG}-metadata.json.__WRITING__" touch "${GHBU_BACKUP_DIR}/${GHBU_ORG}-metadata.json.__WRITING__" GIST_COMMENTLIST="" GIST_COMMENTLIST_PAGE="" REPOLIST="" REPOLIST_PAGE="" PAGENUM=1 # TODO: Convert to get_multipage_file()? while : ; do JSON="" case x"$GHBU_ORGMODE" in xorg*|xuser*) # hat tip to https://gist.github.com/rodw/3073987#gistcomment-3217943 for the license name workaround # The "type=owner" should be default per https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repositories-for-a-user # but with a powerful token a "user" backup may see all repos # one has access to (collaborating in other orgs). Other than # lots of noise and more time to get the listing, this leads # to broken backup cycles when we try to fetch repo names that # are not known under this user's personal namespace. # Note: user-agent causes compacted JSON we can not grep in => jq JSON="$(curl --silent -u "${GHBU_UNAME}:${GHBU_PASSWD}" -H "User-Agent: ${GHBU_UNAME}" "${GHBU_API}${GHBU_ORG_URI}/repos?per_page=100&page=$PAGENUM&type=owner" -q)" check [ "$?" = 0 ] JSON="$(echo "$JSON" | check jq)" echo "$JSON" | grep "API rate limit" && check false REPOLIST_PAGE="$(echo "$JSON" | filter_user_org)" ;; xgist*) JSON="$(curl --silent -u "${GHBU_UNAME}:${GHBU_PASSWD}" -H "User-Agent: ${GHBU_UNAME}" "${GHBU_API}${GHBU_ORG_URI}?per_page=100&page=$PAGENUM" -q)" check [ "$?" = 0 ] JSON="$(echo "$JSON" | check jq)" echo "$JSON" | grep "API rate limit" && check false REPOLIST_PAGE="$(echo "$JSON" | filter_gist)" GIST_COMMENTLIST_PAGE="$(echo "$JSON" | filter_gist_comments)" ;; esac # Not exactly a valid JSON file (gets lists just concatenated), but # so far good enough to grep it, and some parsers treat it well too echo "$JSON" >> "${GHBU_BACKUP_DIR}/${GHBU_ORG}-metadata.json.__WRITING__" if [ -z "$REPOLIST" ] ; then REPOLIST="$REPOLIST_PAGE" else REPOLIST="$REPOLIST $REPOLIST_PAGE" fi if [ -z "$GIST_COMMENTLIST" ] ; then GIST_COMMENTLIST="$GIST_COMMENTLIST_PAGE" else GIST_COMMENTLIST="$GIST_COMMENTLIST $GIST_COMMENTLIST_PAGE" fi if [ 100 -ne `echo $REPOLIST_PAGE | wc -w` ] ; then break fi PAGENUM=$(($PAGENUM+1)) $GHBU_SILENT || echo -n " Fetching next page of repos: $PAGENUM..." done REPOS_COUNT=0 REPOS_TOTAL="`echo $REPOLIST | wc -w`" $GHBU_SILENT || echo " found $REPOS_TOTAL repositories." mv -f "${GHBU_BACKUP_DIR}/${GHBU_ORG}-metadata.json.__WRITING__" "${GHBU_BACKUP_DIR}/${GHBU_ORG}-metadata.json" tgz_nonrepo "${GHBU_BACKUP_DIR}/${GHBU_ORG}-metadata.json" $GHBU_SILENT || (echo "" && echo "=== BACKING UP ===" && echo "") for REPO in $REPOLIST; do REPOS_COUNT=$(($REPOS_COUNT+1)) $GHBU_SILENT || echo "Backing up ${GHBU_ORG}/${REPO} ($REPOS_COUNT of $REPOS_TOTAL)" DIRNAME="`getdir "$REPO"`" { getgit "${REPO}" "${DIRNAME}" || { echo "FAILED to getgit '${REPO}' '${DIRNAME}': will sleep in case it is about the usage quota and try again" sleep 120 if [ x"$GHBU_FAILFAST_GETGIT" = xtrue ]; then echo "RETRY getgit '${REPO}' '${DIRNAME}': failure now WILL BE FATAL" check getgit "${REPO}" "${DIRNAME}" else echo "RETRY getgit '${REPO}' '${DIRNAME}': failure now will NOT be fatal" getgit "${REPO}" "${DIRNAME}" fi } ; } && tgz "${DIRNAME}" # No wikis nor issues for gists; but there are comments (see another loop) case x"$GHBU_ORGMODE" in xorg*|xuser*) $GHBU_SILENT || echo "Backing up ${GHBU_ORG}/${REPO}.wiki (if any)" DIRNAME="`getdir "$REPO.wiki"`" # Failure is an option for wikis: getgit "${REPO}.wiki" "${DIRNAME}" 2>/dev/null && tgz "${DIRNAME}" # NOTE: While internally issues and PRs seem to be the same, # at least using the same numbering and responding with them # to either URL (can discern by "html_url"), they are both # listed via both numbers: all entities are in "issues" but # only PRs (with partially different metadata) are in "pulls". DIRNAME="`getdir "$REPO.issues-and-pulls"`" $GHBU_SILENT || echo "Preparing local git repo in '${DIRNAME}' to receive issue and PR data" if [ -d "${DIRNAME}/.git" ]; then ( cd "${DIRNAME}" && git checkout -f case $? in 0) git clean -fffdddxxx ;; 128) GITOUT="`git log --oneline -1 2>&1`" # `ls` => empty etags.cache list-issues.json list-pulls.json if [ $? = 128 ] && ( echo "${GITOUT}" | grep -E 'fatal: (your current branch .* does not have any commits yet|You are on a branch yet to be born)' ) && [ `ls -1 | wc -l` = 3 ] ; then $GHBU_SILENT || echo "Removing botched earlier preparation of a local git repo in '${DIRNAME}' to receive issue and PR data" # rm -rf .git BASE_DIRNAME="`basename "${DIRNAME}"`" cd .. && rm -rf "${BASE_DIRNAME}" else exit 128 fi ;; esac ) || check [ "$?" = 0 ] fi if [ ! -d "${DIRNAME}/.git" ]; then check mkdir -p "${DIRNAME}" \ && ( cd "${DIRNAME}" && git init \ && { git config gc.autodetach false ; git config commit.gpgsign false ; true ; } \ && touch list-issues.json list-pulls.json etags.cache \ && git add list-issues.json list-pulls.json etags.cache \ && git commit -m 'Initial commit' \ ) || check [ "$?" = 0 ] fi # List of issues: $GHBU_SILENT || echo "Backing up ${GHBU_ORG}/${REPO} issues" ISSUES_FILENAME="${DIRNAME}/list-issues.json" FILENAME="${ISSUES_FILENAME}" \ APIURL="${GHBU_API}/repos/${GHBU_ORG}/${REPO}/issues" \ APIQUERY_SUFFIX="&state=all" \ ENTRYID_REGEX='"url": *"'"${GHBU_API}/repos/${GHBU_ORG}/${REPO}/issues/[0123456789]+"'"' \ get_multipage_file $GHBU_SILENT || echo "Collected ${ENTRY_COUNT} issues in ${MULTIPAGE_NUM} pages for ${GHBU_ORG}/${REPO}; overall success: ${MULTIPAGE_OK}" $MULTIPAGE_OK && ( cd "${DIRNAME}" && git add "`basename "$ISSUES_FILENAME"`" ) || MULTIPAGE_OK=false ISSUES_OK="$MULTIPAGE_OK" ISSUES_NUM="${ENTRY_COUNT}" # List of PRs: $GHBU_SILENT || echo "Backing up ${GHBU_ORG}/${REPO} pull requests" PULLS_FILENAME="${DIRNAME}/list-pulls.json" FILENAME="${PULLS_FILENAME}" \ APIURL="${GHBU_API}/repos/${GHBU_ORG}/${REPO}/pulls" \ APIQUERY_SUFFIX="&state=all" \ ENTRYID_REGEX='"url": *"'"${GHBU_API}/repos/${GHBU_ORG}/${REPO}/pulls?/[0123456789]+"'"' \ get_multipage_file $GHBU_SILENT || echo "Collected ${ENTRY_COUNT} pull requests in ${MULTIPAGE_NUM} pages for ${GHBU_ORG}/${REPO}; overall success: ${MULTIPAGE_OK}" $MULTIPAGE_OK && ( cd "${DIRNAME}" && git add "`basename "$PULLS_FILENAME"`" ) || MULTIPAGE_OK=false PULLS_OK="$MULTIPAGE_OK" PULLS_NUM="${ENTRY_COUNT}" # Contents ( $ISSUES_OK && grep '"comments_url"' "$ISSUES_FILENAME" || true $PULLS_OK && grep -E '"(comments_url|review_comments_url|commits_url)"' "$PULLS_FILENAME" || true ) | awk '{print $NF}' | sed -e 's,^",,' -e 's/",*$//' | sort -n | uniq | \ while IFS= read SUB_URL \ ; do SUB_FILENAME="`echo "$SUB_URL" | sed -e "s,^${GHBU_API}/repos/${GHBU_ORG}/${REPO}/,," -e 's,[:/],-,g'`.json" # Skip user metadata case "$SUB_FILENAME" in http---*|https---*|*"{-"*"}"*) continue ;; esac $GHBU_SILENT || echo "Backing up ${GHBU_ORG}/${REPO} issue or pull request details from: ${SUB_URL}" FILENAME="${DIRNAME}/${SUB_FILENAME}" APIURL="${SUB_URL}" \ ENTRYID_REGEX='("sha": *"[0-9a-f]{40}"|"url": *"'"${GHBU_API}/repos/${GHBU_ORG}/${REPO}/(issues|pulls)/comments/[0-9]+"'")' \ get_multipage_file \ && ( cd "${DIRNAME}" && git add "${SUB_FILENAME}" ) done $PULLS_OK && $ISSUES_OK \ && ( GITMSG="Update due to backup at `LANG=C LC_ALL=C TZ=UTC date -u`" cd "${DIRNAME}" && \ git add etags.cache && \ { git commit -m "${GITMSG}" || { echo "Retry git commit without GPG" >&2 ; git commit --no-gpg-sign -m "${GITMSG}" ; } ; } ) && tgz "${DIRNAME}" ;; esac done # Assumes GHBU_ORGMODE=gist, but no reason to constrain: COMMENT_COUNT=0 COMMENT_TOTAL="`echo $GIST_COMMENTLIST | wc -w`" for COMMENT_URL in $GIST_COMMENTLIST; do COMMENT_COUNT=$(($COMMENT_COUNT+1)) $GHBU_SILENT || echo "Backing up ${GHBU_ORG}/${COMMENT_URL} comments ($COMMENT_COUNT of $COMMENT_TOTAL)" FILENAME="`getdir "${COMMENT_URL}.comments" | sed 's,.git$,,'`" check curl --silent -u "${GHBU_UNAME}:${GHBU_PASSWD}" \ -H "User-Agent: ${GHBU_UNAME}" \ "${COMMENT_URL}" -q \ > "${FILENAME}.__WRITING__" \ && mv -f "${FILENAME}.__WRITING__" "${FILENAME}" \ && tgz_nonrepo "${FILENAME}" done # NOTE: the "latest" and optional "prev" handling below allows us to leave at # least one (better two) backup tarballs for each timestamped item sequence. # GitHub going AWOL and us deleting all backups after 3 days would be folly! # (Less of a problem if we do keep the repos, but comments/issues/medatata # are still at risk - maybe GIT their evolution locally?) # NOTE: according to `man find` (GNU, comments to `-atime` et al handling), # the fractional parts of "n*24 hours" are ignored, "so to match -atime +1, # a file has to have been accessed at least two days ago". # This way, GHBU_PRUNE_AFTER_N_DAYS=0 only chops files older than 24 hours. if $GHBU_PRUNE_OLD && [ "${GHBU_PRUNE_AFTER_N_DAYS}" -ge 0 ]; then $GHBU_SILENT || (echo "" && echo "=== PRUNING ===" && echo "") $GHBU_SILENT || echo "Pruning backup files ${GHBU_PRUNE_AFTER_N_DAYS} days old or older." $GHBU_SILENT || echo "Found `find $GHBU_BACKUP_DIR -maxdepth 1 -name '*.tar.gz' -a \! -name '*.prev.tar.gz' -a \! -name '*.latest.tar.gz' -mtime +${GHBU_PRUNE_AFTER_N_DAYS} | wc -l` files to prune." find $GHBU_BACKUP_DIR -maxdepth 1 -name '*.tar.gz' -a \! -name '*.prev.tar.gz' -a \! -name '*.latest.tar.gz' -mtime "+${GHBU_PRUNE_AFTER_N_DAYS}" -exec rm -fv {} > /dev/null \; $GHBU_SILENT || (echo "" && echo "=== PRUNING FINISHED ===" && echo "") fi prune_incomplete $GHBU_SILENT || (echo "" && echo "=== DONE ===" && echo "") $GHBU_SILENT || (echo "GitHub backup for ${GHBU_ORG} (${GHBU_ORGMODE}) completed." && echo "")