#!/bin/bash
# A simple script to backup an organization's GitHub repositories.
# Initially from https://gist.github.com/rodw/3073987 snapshot at
# https://gist.githubusercontent.com/rodw/3073987/raw/d5e9ab4785647e558df488eb18623aa6c52af86b/backup-github.sh
# Continued afterwards 2023+ by Jim Klimov <gmail.com>
#-------------------------------------------------------------------------------
# NOTES:
#-------------------------------------------------------------------------------
# * Under the heading "CONFIG" below you'll find a number of configuration
# parameters that must be personalized for your GitHub account and org.
# Replace the `<CHANGE-ME>` strings with the value described in the comments
# (or overwrite those values at run-time by providing environment variables).
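#   For example, a one-off run with runtime overrides might look like
#   this (illustrative values, not real credentials):
#     GHBU_ORG=myorg GHBU_UNAME=mylogin GHBU_PASSWD="$MY_TOKEN" \
#       GHBU_BACKUP_DIR=/var/backups/github ./backup-github.sh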
#
# * Your terminal/screen session used for backups would benefit from having
# your SSH key preloaded into the ssh-agent, e.g.:
# eval `ssh-agent`
# ssh-add ~/.ssh/id_rsa
# or otherwise made available to the git client (no passphrase? oh no!)
#
# * If you have more than 100 repositories, the script steps through the
# list of repos returned by GitHub one page at a time; beware of API
# limits (server-side throttling). HTTP-304 caching (ETag and
# If-Modified-Since headers, see get_multipage_file below) helps avoid
# fetches that bring no news, though not yet for the repo listing itself.
#
# * If you want to back up the repos for a USER rather than an ORGANIZATION,
# or a user's gists (and their comments), see GHBU_ORGMODE setting below.
#
# * Thanks to @rodw for the original script, and to @Calrion, @vnaum,
# @BartHaagdorens and other commenters in original gist for various fixes
# and updates.
#
# * Also see those comments (and related revisions and forks) for more
# information and general troubleshooting.
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# CONFIG:
#-------------------------------------------------------------------------------
GHBU_ORG=${GHBU_ORG-"<CHANGE-ME>"} # the GitHub organization whose repos will be backed up
# # (if you're backing up a USER's repos, this should be your GitHub username; also see the note below about the `REPOLIST` definition)
GHBU_UNAME=${GHBU_UNAME-"<CHANGE-ME>"} # the username of a GitHub account (to use with the GitHub API)
GHBU_PASSWD=${GHBU_PASSWD-"<CHANGE-ME>"} # the password for that account (a personal access token works here as well)
#-------------------------------------------------------------------------------
GHBU_ORGMODE=${GHBU_ORGMODE-"org"} # "org", "user" or "gists"?
GHBU_BACKUP_DIR=${GHBU_BACKUP_DIR-"github-backups"} # where to place the backup files; avoid using ":" in the name (confuses tar as a hostname; confuses Windows as a drive letter)
GHBU_GITHOST=${GHBU_GITHOST-"github.com"} # the GitHub hostname (see comments)
GHBU_REUSE_REPOS=${GHBU_REUSE_REPOS-false} # as part of the backup process, we mirror-clone remote git repos; should we keep and reuse them for the next backups (true), or always snatch from scratch (false)?
GHBU_TARBALL_REPOS=${GHBU_TARBALL_REPOS-true} # when `true`, a tarball is made for each backed-up repository; when `false` (e.g. if persistent repos due to GHBU_REUSE_REPOS=true suffice), only non-git items are tarballed (issues, comments, metadata)
GHBU_PRUNE_INCOMPLETE=${GHBU_PRUNE_INCOMPLETE-false} # when `true`, backups named like *.__WRITING__ will be deleted when the script starts (set `false` if using the same GHBU_BACKUP_DIR for several scripts running in parallel)
GHBU_PRUNE_PREV=${GHBU_PRUNE_PREV-false} # when `true`, only the `*.latest.tar.gz` backups will be tracked; when `false`, also the `*.prev.tar.gz`
GHBU_PRUNE_OLD=${GHBU_PRUNE_OLD-true} # when `true`, old backups will be deleted
GHBU_PRUNE_AFTER_N_DAYS=${GHBU_PRUNE_AFTER_N_DAYS-3} # the min age (in days) of backup files to delete
GHBU_SILENT=${GHBU_SILENT-false} # when `true`, only show error messages
GHBU_API=${GHBU_API-"https://api.github.com"} # base URI for the GitHub API
GHBU_GIT_CLONE_CMD="GITCMD clone --quiet --mirror " # base command to use to clone GitHub repos from a URL (may need more info for SSH)
GHBU_GIT_CLONE_CMD_SSH="${GHBU_GIT_CLONE_CMD} git@${GHBU_GITHOST}:" # base command to use to clone GitHub repos over SSH
GHBU_FAILFAST_GETGIT="${GHBU_FAILFAST_GETGIT-true}" # if true, repeated failure of getgit() will fail the script; if false, move on to the other repos
TSTAMP="`TZ=UTC date "+%Y%m%dT%H%MZ"`" # format of timestamp suffix appended to archived files
#-------------------------------------------------------------------------------
# (end config)
#-------------------------------------------------------------------------------
# The function `check` will exit the script if the given command fails.
function check {
"$@"
status=$?
if [ $status -ne 0 ]; then
echo "ERROR: Encountered error (${status}) while running the following:" >&2
echo " $@" >&2
echo " (at line ${BASH_LINENO[0]} of file $0.)" >&2
echo " Aborting." >&2
exit $status
fi
}
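# For example, `check mkdir -p "$GHBU_BACKUP_DIR"` (as used below) aborts
# the whole script with mkdir's exit code if the directory can not be made.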
# The function `check_unexpected_success` will exit the script if the given command DOES NOT FAIL.
function check_unexpected_success {
"$@"
status=$?
if [ $status -eq 0 ]; then
echo "ERROR: Encountered error (unexpected success) while running the following:" >&2
echo " $@" >&2
echo " (at line ${BASH_LINENO[0]} of file $0.)" >&2
echo " Aborting." >&2
exit 1
fi
}
# The function `tgz` will create a gzipped tar archive of the specified
# path ($1, typically a repo directory) and then optionally remove the original
function tgz {
$GHBU_TARBALL_REPOS || return 0
check tar zcf "$1.$TSTAMP.tar.gz.__WRITING__" "$1" \
&& check mv -f "$1.$TSTAMP.tar.gz.__WRITING__" "$1.$TSTAMP.tar.gz" \
|| return
if ! $GHBU_REUSE_REPOS ; then
check rm -rf "$1"
fi
# Let one copy (or two if "prev" is used) survive the auto-prune
if $GHBU_PRUNE_PREV ; then
rm -f "$1.prev.tar.gz" || true
rm -f "$1.latest.tar.gz" || true
else
if [ -e "$1.latest.tar.gz" ] ; then
mv -f "$1.latest.tar.gz" "$1.prev.tar.gz" || true
fi
fi
check ln "$1.$TSTAMP.tar.gz" "$1.latest.tar.gz"
}
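# For example (illustrative repo name): `tgz github-backups/myorg-myrepo.git`
# produces github-backups/myorg-myrepo.git.<TSTAMP>.tar.gz, hard-links it
# as myorg-myrepo.git.latest.tar.gz (rotating an older "latest" into
# ".prev.tar.gz" unless GHBU_PRUNE_PREV is true), and removes the source
# directory unless GHBU_REUSE_REPOS is true. It is a no-op when
# GHBU_TARBALL_REPOS is false.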
# Shortcut for non-repo items (issues, comments, metadata JSONs...)
function tgz_nonrepo {
GHBU_TARBALL_REPOS=true \
GHBU_REUSE_REPOS=false \
tgz "$@"
}
# Optionally delete files with a __WRITING__ extension,
# likely abandoned due to run-time errors like a reboot,
# Ctrl+C, connection loss, disk space...
function prune_incomplete {
if $GHBU_PRUNE_INCOMPLETE ; then
$GHBU_SILENT || (echo "" && echo "=== PRUNING INCOMPLETE LEFTOVERS (if any) ===" && echo "")
$GHBU_SILENT || echo "Found `find $GHBU_BACKUP_DIR -maxdepth 1 -name '*.__WRITING__*' | wc -l` files to prune."
find $GHBU_BACKUP_DIR -maxdepth 1 -name '*.__WRITING__*' -exec rm -fv {} > /dev/null \;
$GHBU_SILENT || (echo "" && echo "=== PRUNING FINISHED ===" && echo "")
fi
}
# The function `getdir` will print the repo directory name on stdout
# if successful; the name depends on the GHBU_REUSE_REPOS value.
function getdir {
local REPOURI="$1"
local DIRNAME
REPOURI="$(echo "$REPOURI" | sed 's,^https://gist.github.com/\(.*\)$,gist-\1,')"
# Note our caller adds a ".comments" suffix; another form may come straight from the API, e.g.:
# https://api.github.com/gists/b92a4fe5bb8eab70e79d6f1581563863/comments
REPOURI="$(echo "$REPOURI" | sed 's,^'"$GHBU_API"'/gists/\([^/]*\)/comments\(\.comments\)*$,gist-\1-comments,')"
DIRNAME="${GHBU_BACKUP_DIR}/${GHBU_ORG}-${REPOURI}"
if ! $GHBU_REUSE_REPOS ; then
DIRNAME="${DIRNAME}-${TSTAMP}"
fi
case "$REPOURI" in
*.git) ;;
*) DIRNAME="${DIRNAME}.git" ;;
esac
echo "$DIRNAME"
}
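# For example (illustrative), with GHBU_ORG=myorg: `getdir myrepo` prints
# "github-backups/myorg-myrepo.git" when GHBU_REUSE_REPOS=true, or
# "github-backups/myorg-myrepo-<TSTAMP>.git" when false, so one-shot
# clones never collide with persistent ones.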
# See comments below
function GITCMD {
if [ -z "$CRED_HELPER" ] ; then
git "$@"
else
# Note the deletion of credential.helper first: it is multi-valued
# Per https://stackoverflow.com/a/70963737 explanation:
# > The additional empty value for credential.helper causes any
# > existing credential helpers to be removed, preventing the
# > addition of this token into the user's credential helper.
# > If you'd like the user to be able to save it, then remove
# > that directive.
# In our case, the first HTTP(S) download would block asking for
# user/pass; however a "credential.helper=cache --timeout=360000"
# might help subsequent activities (and/or corrupt them, if we use
# different credentials for backups and/or interactive development).
git -c credential.helper= -c credential.helper="$CRED_HELPER" "$@"
fi
}
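# For example, with CRED_HELPER set, `GITCMD fetch --all` expands to
# `git -c credential.helper= -c credential.helper="$CRED_HELPER" fetch --all`,
# so only our inline helper supplies credentials for that one command.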
# The function `getgit` will clone (or update) specified repo ($1, without
# a `.git` suffix) into specified directory ($2)
function getgit (
# Sub-shelled to constrain "export" visibility of credentials
local REPOURI="$1"
local DIRNAME="$2"
case x"$1" in
xhttp://*|xhttps://*)
# Prepare HTTP(S) credential support for this git operation.
local CRED_HELPER='!f() { echo "username=$GHBU_UNAME"; echo "password=$GHBU_PASSWD"; }; f'
export CRED_HELPER GHBU_UNAME GHBU_PASSWD
;;
esac
if $GHBU_REUSE_REPOS && [ -d "${DIRNAME}" ] ; then
# Update an existing repo (if reusing)
$GHBU_SILENT || echo "... Updating $REPOURI clone in $DIRNAME"
# Note "fatal: fetch --all does not make sense with refspecs" likely in the mirror:
(cd "${DIRNAME}" && {
GITCMD fetch --quiet --tags \
&& GITCMD fetch --quiet --all \
|| GITCMD fetch --quiet
}) || return
else
$GHBU_SILENT || echo "... Cloning $REPOURI into $DIRNAME"
case x"$1" in
x*@*|x*://*) # URL already; .git suffix should be irrelevant
${GHBU_GIT_CLONE_CMD} "${REPOURI}" "${DIRNAME}" || return
;;
*) # Just a repo name - complete it with data we know of
${GHBU_GIT_CLONE_CMD_SSH}"${GHBU_ORG}/${REPOURI}.git" "${DIRNAME}" \
|| { # Errors were seen above, so no GHBU_SILENT here:
echo "..... Attempt a retry over HTTPS" >&2
# FIXME: Effectively we craft the clone_url
# here, rather than using one from metadata
${GHBU_GIT_CLONE_CMD} "https://${GHBU_GITHOST}/${GHBU_ORG}/${REPOURI}" "${DIRNAME}" \
&& echo "..... Attempt a retry over HTTPS: SUCCEEDED" >&2 ; \
} || return
;;
esac
fi
# Return a success either way here:
$GHBU_SILENT || echo "+++ Received $REPOURI into $DIRNAME"
)
function filter_user_org {
# Might be better off getting a "clone_url" here, but so far our
# directory naming etc. rely on the "REPONAME" value received here:
check grep '^ "name"' | check awk -F': "' '{print $2}' | check sed -e 's/",//g'
}
function filter_gist {
check sed -n 's/.*git_pull_url": "\(.*\)",/\1/p'
}
function filter_gist_comments {
check sed -n 's/.*comments_url": "\(.*\)",/\1/p'
}
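# For example, in jq-prettified listings a top-level repo entry contains
# a line like `    "name": "myrepo",` (illustrative name; the exact
# indentation matters for filter_user_org, distinguishing it from nested
# "name" keys such as the license object's), while gist entries carry
# "git_pull_url"/"comments_url" lines that filter_gist and
# filter_gist_comments reduce to bare URLs, one per line.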
function get_multipage_file {
# Uses caller envvars FILENAME, APIURL, APIQUERY_SUFFIX, ENTRYID_REGEX
# Sets MULTIPAGE_NUM, MULTIPAGE_OK, ENTRY_COUNT for the caller
# Assumes GNU date or compatible. Tries to avoid extra traffic for us
# and load for the servers (and REST API quota hits) by telling the
# GitHub REST API "If-Modified-Since" we last fetched the document:
[ -s "$FILENAME" ] && FILEDATE="`date -R -u -r "$FILENAME" | sed 's,\+0000,GMT,'`" || FILEDATE=""
ETAGS="`dirname "$FILENAME"`/etags.cache"
[ -s "$ETAGS" ] && FILEETAG="`grep -E "\t$APIURL\$" < "$ETAGS" | awk '{print $1}'`" || FILEETAG=""
MULTIPAGE_NUM=1
MULTIPAGE_OK=true
ENTRY_COUNT=0
while : ; do
# ex. APIURL="${GHBU_API}/repos/${GHBU_ORG}/${REPO}/issues"
# ex. APIQUERY_SUFFIX="&state=all"
# Also note: https://docs.github.com/en/rest/overview/resources-in-the-rest-api?apiVersion=2022-11-28#user-agent-required
# hence the `User-Agent: username` header; however it causes JSON
# markup not prettified for humans (nor for line-based grep)
# Ordering defaults to showing newest issues first
# TODO: Can we impact the ordering of comments so the newest-changed
# would be first, and we would know from the first page
# whether we need to re-fetch this?..
CURLRES=0
rm -f "${FILENAME}.headers" || true
WANT_HEADER=""
if [ 1 = "$MULTIPAGE_NUM" ] && ( [ -n "$FILEETAG" ] || [ -n "$FILEDATE" ] ) ; then
WANT_HEADER="yes"
fi
curl --silent -u "${GHBU_UNAME}:${GHBU_PASSWD}" \
-H "User-Agent: ${GHBU_UNAME}" \
${FILEETAG:+-H "If-None-Match: ${FILEETAG}"} \
${FILEDATE:+-H "If-Modified-Since: ${FILEDATE}"} \
${WANT_HEADER:+-D "${FILENAME}.headers"} \
"${APIURL}?per_page=100&page=${MULTIPAGE_NUM}${APIQUERY_SUFFIX}" -q \
> "${FILENAME}.__WRITING__.tmp" \
|| {
echo "FAILED to fetch '${APIURL}' once: will sleep in case it is about the usage quota and try again"
sleep 120
check curl --silent -u "${GHBU_UNAME}:${GHBU_PASSWD}" \
-H "User-Agent: ${GHBU_UNAME}" \
${FILEETAG:+-H "If-None-Match: ${FILEETAG}"} \
${FILEDATE:+-H "If-Modified-Since: ${FILEDATE}"} \
${WANT_HEADER:+-D "${FILENAME}.headers"} \
"${APIURL}?per_page=100&page=${MULTIPAGE_NUM}${APIQUERY_SUFFIX}" -q \
> "${FILENAME}.__WRITING__.tmp"
} || CURLRES=$?
# NOTE: Value may include quotes; header is posted with them!
NEWETAG="`grep -i '^etag:' "${FILENAME}.headers" 2>/dev/null | sed 's,^[Ee][Tt][Aa][Gg]: *,,' | tr -d '\r' | tr -d '\n'`" || NEWETAG=""
if [ -n "$NEWETAG" ] && [ x"$NEWETAG" != x"$FILEETAG" ] ; then
printf '%s\t%s\n' "$NEWETAG" "$APIURL" > "$ETAGS.tmp"
if [ -s "$ETAGS" ]; then grep -v "$APIURL" "$ETAGS" >> "$ETAGS.tmp" ; fi
mv "$ETAGS.tmp" "$ETAGS"
fi
if head -1 "${FILENAME}.headers" | grep -E "HTTP.*304" > /dev/null ; then
# Let HTTP-304 skips be seen
$GHBU_SILENT || echo "SKIP: (First page of) the requested resource did not change"
rm -f "${FILENAME}.headers" "${FILENAME}.__WRITING__.tmp"
MULTIPAGE_OK=true
ENTRY_COUNT="`grep -Ec "$ENTRYID_REGEX" < "${FILENAME}"`"
break
fi
# Only send the cache-validation headers when asking for the first page
unset FILEDATE
unset FILEETAG
if [ $CURLRES != 0 ] ; then
MULTIPAGE_OK=false
break
fi
if [ 1 = "$MULTIPAGE_NUM" ] ; then
# Not cached => replace
rm -f "$FILENAME" || true
fi
# Produce pretty JSON we can grep in, and otherwise manipulate below:
jq < "${FILENAME}.__WRITING__.tmp" > "${FILENAME}.__WRITING__"
rm -f "${FILENAME}.__WRITING__.tmp"
if [ "`wc -c < "${FILENAME}.__WRITING__"`" -lt 1024 ] \
&& grep -q '"message": "API rate limit exceeded' "${FILENAME}.__WRITING__" \
; then
echo "FAILED to fetch '${APIURL}': got some contents but they are short and only say that API rate limit exceeded"
MULTIPAGE_OK=false
break
fi
# ex. ENTRYID_REGEX='"url": "'"${GHBU_API}/repos/${GHBU_ORG}/${REPO}/issues/[0123456789]+"'"'
NUM="`grep -Ec "$ENTRYID_REGEX" < "${FILENAME}.__WRITING__"`"
ENTRY_COUNT="`expr $ENTRY_COUNT + $NUM`"
if [ ! -e "${FILENAME}" ] ; then
check mv -f "${FILENAME}.__WRITING__" "${FILENAME}"
else
(head -n -2 "${FILENAME}" && \
echo " }," && \
tail -n +2 "${FILENAME}.__WRITING__"
) > "${FILENAME}.__WRITING__.tmp" \
|| { MULTIPAGE_OK=false; break; }
mv -f "${FILENAME}.__WRITING__.tmp" "${FILENAME}"
rm -f "${FILENAME}.__WRITING__"
fi
if [ $NUM -lt 100 ] ; then
# Last page
break
fi
MULTIPAGE_NUM="`expr $MULTIPAGE_NUM + 1`"
done
rm -f "${FILENAME}.headers"
# Return status
$MULTIPAGE_OK
}
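# Typical call pattern (matching the issue/PR backup loop below) passes
# the parameters as per-invocation environment variables, e.g.:
# FILENAME="$DIRNAME/list-issues.json" \
# APIURL="${GHBU_API}/repos/${GHBU_ORG}/${REPO}/issues" \
# APIQUERY_SUFFIX="&state=all" \
# ENTRYID_REGEX='...' \
# get_multipage_file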
check command -v jq
$GHBU_SILENT || (echo "" && echo "=== INITIALIZING ===" && echo "")
$GHBU_SILENT || echo "Using backup directory $GHBU_BACKUP_DIR"
check mkdir -p "$GHBU_BACKUP_DIR"
prune_incomplete
$GHBU_SILENT || echo -n "Fetching list of repositories for ${GHBU_ORG}..."
case x"$GHBU_ORGMODE" in
x"gist"|x"gists")
GHBU_ORG_URI="/gists"
;;
x"user"|x"users")
# NOTE: if you're backing up a *user's* repos, not an organization's, use this mode instead:
if [ "${GHBU_ORG}" = "${GHBU_UNAME}" ] ; then
# Backing up ourselves
GHBU_ORG_URI="/user"
else
# Backing up a friend/alter-ego (using our login)
GHBU_ORG_URI="/users/${GHBU_ORG}"
fi
;;
x"org"|*) # Legacy default
GHBU_ORG_URI="/orgs/${GHBU_ORG}"
;;
esac
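# For example, GHBU_ORGMODE=org with GHBU_ORG=myorg lists repos from
# "${GHBU_API}/orgs/myorg/repos" below, while GHBU_ORGMODE=gists lists
# the authenticated user's gists from "${GHBU_API}/gists" (GHBU_ORG is
# then only used for naming the backup files).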
# Be sure to stash a fresh copy, even if previous backup was interrupted
rm -f "${GHBU_BACKUP_DIR}/${GHBU_ORG}-metadata.json.__WRITING__"
touch "${GHBU_BACKUP_DIR}/${GHBU_ORG}-metadata.json.__WRITING__"
GIST_COMMENTLIST=""
GIST_COMMENTLIST_PAGE=""
REPOLIST=""
REPOLIST_PAGE=""
PAGENUM=1
# TODO: Convert to get_multipage_file()?
while : ; do
JSON=""
case x"$GHBU_ORGMODE" in
xorg*|xuser*)
# hat tip to https://gist.github.com/rodw/3073987#gistcomment-3217943 for the license name workaround
# The "type=owner" should be default per https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repositories-for-a-user
# but with a powerful token a "user" backup may see all repos
# one has access to (collaborating in other orgs). Other than
# lots of noise and more time to get the listing, this leads
# to broken backup cycles when we try to fetch repo names that
# are not known under this user's personal namespace.
# Note: the user-agent header causes compacted JSON we can not grep in => jq
JSON="$(curl --silent -u "${GHBU_UNAME}:${GHBU_PASSWD}" -H "User-Agent: ${GHBU_UNAME}" "${GHBU_API}${GHBU_ORG_URI}/repos?per_page=100&page=$PAGENUM&type=owner" -q)"
check [ "$?" = 0 ]
JSON="$(echo "$JSON" | check jq)"
echo "$JSON" | grep "API rate limit" && check false
REPOLIST_PAGE="$(echo "$JSON" | filter_user_org)"
;;
xgist*)
JSON="$(curl --silent -u "${GHBU_UNAME}:${GHBU_PASSWD}" -H "User-Agent: ${GHBU_UNAME}" "${GHBU_API}${GHBU_ORG_URI}?per_page=100&page=$PAGENUM" -q)"
check [ "$?" = 0 ]
JSON="$(echo "$JSON" | check jq)"
echo "$JSON" | grep "API rate limit" && check false
REPOLIST_PAGE="$(echo "$JSON" | filter_gist)"
GIST_COMMENTLIST_PAGE="$(echo "$JSON" | filter_gist_comments)"
;;
esac
# Not exactly a valid JSON file (the per-page lists just get concatenated),
# but so far good enough to grep, and some parsers tolerate it too
echo "$JSON" >> "${GHBU_BACKUP_DIR}/${GHBU_ORG}-metadata.json.__WRITING__"
if [ -z "$REPOLIST" ] ; then
REPOLIST="$REPOLIST_PAGE"
else
REPOLIST="$REPOLIST
$REPOLIST_PAGE"
fi
if [ -z "$GIST_COMMENTLIST" ] ; then
GIST_COMMENTLIST="$GIST_COMMENTLIST_PAGE"
else
GIST_COMMENTLIST="$GIST_COMMENTLIST
$GIST_COMMENTLIST_PAGE"
fi
if [ 100 -ne `echo $REPOLIST_PAGE | wc -w` ] ; then
break
fi
PAGENUM=$(($PAGENUM+1))
$GHBU_SILENT || echo -n " Fetching next page of repos: $PAGENUM..."
done
REPOS_COUNT=0
REPOS_TOTAL="`echo $REPOLIST | wc -w`"
$GHBU_SILENT || echo " found $REPOS_TOTAL repositories."
mv -f "${GHBU_BACKUP_DIR}/${GHBU_ORG}-metadata.json.__WRITING__" "${GHBU_BACKUP_DIR}/${GHBU_ORG}-metadata.json"
tgz_nonrepo "${GHBU_BACKUP_DIR}/${GHBU_ORG}-metadata.json"
$GHBU_SILENT || (echo "" && echo "=== BACKING UP ===" && echo "")
for REPO in $REPOLIST; do
REPOS_COUNT=$(($REPOS_COUNT+1))
$GHBU_SILENT || echo "Backing up ${GHBU_ORG}/${REPO} ($REPOS_COUNT of $REPOS_TOTAL)"
DIRNAME="`getdir "$REPO"`"
{ getgit "${REPO}" "${DIRNAME}" || {
echo "FAILED to getgit '${REPO}' '${DIRNAME}': will sleep in case it is about the usage quota and try again"
sleep 120
if [ x"$GHBU_FAILFAST_GETGIT" = xtrue ]; then
echo "RETRY getgit '${REPO}' '${DIRNAME}': failure now WILL BE FATAL"
check getgit "${REPO}" "${DIRNAME}"
else
echo "RETRY getgit '${REPO}' '${DIRNAME}': failure now will NOT be fatal"
getgit "${REPO}" "${DIRNAME}"
fi
} ; } && tgz "${DIRNAME}"
# No wikis nor issues for gists; but there are comments (see another loop)
case x"$GHBU_ORGMODE" in
xorg*|xuser*)
$GHBU_SILENT || echo "Backing up ${GHBU_ORG}/${REPO}.wiki (if any)"
DIRNAME="`getdir "$REPO.wiki"`"
# Failure is an option for wikis:
getgit "${REPO}.wiki" "${DIRNAME}" 2>/dev/null && tgz "${DIRNAME}"
# NOTE: While internally issues and PRs seem to be the same,
# at least using the same numbering and responding with them
# to either URL (can be discerned by "html_url"), they are
# listed via both endpoints: all entities are in "issues" but
# only PRs (with partially different metadata) are in "pulls".
DIRNAME="`getdir "$REPO.issues-and-pulls"`"
$GHBU_SILENT || echo "Preparing local git repo in '${DIRNAME}' to receive issue and PR data"
if [ -d "${DIRNAME}/.git" ]; then
( cd "${DIRNAME}" && git checkout -f
case $? in
0) git clean -fffdddxxx ;;
128) GITOUT="`git log --oneline -1 2>&1`"
# `ls` here shows just the empty etags.cache list-issues.json list-pulls.json
if [ $? = 128 ] && ( echo "${GITOUT}" | grep -E 'fatal: (your current branch .* does not have any commits yet|You are on a branch yet to be born)' ) && [ `ls -1 | wc -l` = 3 ] ; then
$GHBU_SILENT || echo "Removing botched earlier preparation of a local git repo in '${DIRNAME}' to receive issue and PR data"
# rm -rf .git
BASE_DIRNAME="`basename "${DIRNAME}"`"
cd .. && rm -rf "${BASE_DIRNAME}"
else
exit 128
fi
;;
esac
) || check [ "$?" = 0 ]
fi
if [ ! -d "${DIRNAME}/.git" ]; then
check mkdir -p "${DIRNAME}" \
&& ( cd "${DIRNAME}" && git init \
&& { git config gc.autodetach false ; git config commit.gpgsign false ; true ; } \
&& touch list-issues.json list-pulls.json etags.cache \
&& git add list-issues.json list-pulls.json etags.cache \
&& git commit -m 'Initial commit' \
) || check [ "$?" = 0 ]
fi
# List of issues:
$GHBU_SILENT || echo "Backing up ${GHBU_ORG}/${REPO} issues"
ISSUES_FILENAME="${DIRNAME}/list-issues.json"
FILENAME="${ISSUES_FILENAME}" \
APIURL="${GHBU_API}/repos/${GHBU_ORG}/${REPO}/issues" \
APIQUERY_SUFFIX="&state=all" \
ENTRYID_REGEX='"url": *"'"${GHBU_API}/repos/${GHBU_ORG}/${REPO}/issues/[0123456789]+"'"' \
get_multipage_file
$GHBU_SILENT || echo "Collected ${ENTRY_COUNT} issues in ${MULTIPAGE_NUM} pages for ${GHBU_ORG}/${REPO}; overall success: ${MULTIPAGE_OK}"
$MULTIPAGE_OK && ( cd "${DIRNAME}" && git add "`basename "$ISSUES_FILENAME"`" ) || MULTIPAGE_OK=false
ISSUES_OK="$MULTIPAGE_OK"
ISSUES_NUM="${ENTRY_COUNT}"
# List of PRs:
$GHBU_SILENT || echo "Backing up ${GHBU_ORG}/${REPO} pull requests"
PULLS_FILENAME="${DIRNAME}/list-pulls.json"
FILENAME="${PULLS_FILENAME}" \
APIURL="${GHBU_API}/repos/${GHBU_ORG}/${REPO}/pulls" \
APIQUERY_SUFFIX="&state=all" \
ENTRYID_REGEX='"url": *"'"${GHBU_API}/repos/${GHBU_ORG}/${REPO}/pulls?/[0123456789]+"'"' \
get_multipage_file
$GHBU_SILENT || echo "Collected ${ENTRY_COUNT} pull requests in ${MULTIPAGE_NUM} pages for ${GHBU_ORG}/${REPO}; overall success: ${MULTIPAGE_OK}"
$MULTIPAGE_OK && ( cd "${DIRNAME}" && git add "`basename "$PULLS_FILENAME"`" ) || MULTIPAGE_OK=false
PULLS_OK="$MULTIPAGE_OK"
PULLS_NUM="${ENTRY_COUNT}"
# Contents
( $ISSUES_OK && grep '"comments_url"' "$ISSUES_FILENAME" || true
$PULLS_OK && grep -E '"(comments_url|review_comments_url|commits_url)"' "$PULLS_FILENAME" || true
) | awk '{print $NF}' | sed -e 's,^",,' -e 's/",*$//' | sort -n | uniq | \
while IFS= read SUB_URL \
; do
SUB_FILENAME="`echo "$SUB_URL" | sed -e "s,^${GHBU_API}/repos/${GHBU_ORG}/${REPO}/,," -e 's,[:/],-,g'`.json"
# Skip user metadata
case "$SUB_FILENAME" in
http---*|https---*|*"{-"*"}"*) continue ;;
esac
$GHBU_SILENT || echo "Backing up ${GHBU_ORG}/${REPO} issue or pull request details from: ${SUB_URL}"
FILENAME="${DIRNAME}/${SUB_FILENAME}" APIURL="${SUB_URL}" \
ENTRYID_REGEX='("sha": *"[0-9a-f]{40}"|"url": *"'"${GHBU_API}/repos/${GHBU_ORG}/${REPO}/(issues|pulls)/comments/[0-9]+"'")' \
get_multipage_file \
&& ( cd "${DIRNAME}" && git add "${SUB_FILENAME}" )
done
$PULLS_OK && $ISSUES_OK \
&& (
GITMSG="Update due to backup at `LANG=C LC_ALL=C TZ=UTC date -u`"
cd "${DIRNAME}" && \
git add etags.cache && \
{ git commit -m "${GITMSG}" || { echo "Retry git commit without GPG" >&2 ; git commit --no-gpg-sign -m "${GITMSG}" ; } ; }
) && tgz "${DIRNAME}"
;;
esac
done
# Assumes GHBU_ORGMODE=gist, but no reason to constrain:
COMMENT_COUNT=0
COMMENT_TOTAL="`echo $GIST_COMMENTLIST | wc -w`"
for COMMENT_URL in $GIST_COMMENTLIST; do
COMMENT_COUNT=$(($COMMENT_COUNT+1))
$GHBU_SILENT || echo "Backing up ${GHBU_ORG}/${COMMENT_URL} comments ($COMMENT_COUNT of $COMMENT_TOTAL)"
FILENAME="`getdir "${COMMENT_URL}.comments" | sed 's,.git$,,'`"
check curl --silent -u "${GHBU_UNAME}:${GHBU_PASSWD}" \
-H "User-Agent: ${GHBU_UNAME}" \
"${COMMENT_URL}" -q \
> "${FILENAME}.__WRITING__" \
&& mv -f "${FILENAME}.__WRITING__" "${FILENAME}" \
&& tgz_nonrepo "${FILENAME}"
done
# NOTE: the "latest" and optional "prev" handling below allows us to leave at
# least one (better two) backup tarballs for each timestamped item sequence.
# GitHub going AWOL and us deleting all backups after 3 days would be folly!
# (Less of a problem if we do keep the repos, but comments/issues/metadata
# are still at risk - maybe GIT their evolution locally?)
# NOTE: according to `man find` (GNU, comments to `-atime` et al handling),
# the fractional parts of "n*24 hours" are ignored, "so to match -atime +1,
# a file has to have been accessed at least two days ago".
# This way, GHBU_PRUNE_AFTER_N_DAYS=0 only chops files older than 24 hours.
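# For example, with the default GHBU_PRUNE_AFTER_N_DAYS=3 a timestamped
# tarball is only removed once it is at least four full days old.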
if $GHBU_PRUNE_OLD && [ "${GHBU_PRUNE_AFTER_N_DAYS}" -ge 0 ]; then
$GHBU_SILENT || (echo "" && echo "=== PRUNING ===" && echo "")
$GHBU_SILENT || echo "Pruning backup files ${GHBU_PRUNE_AFTER_N_DAYS} days old or older."
$GHBU_SILENT || echo "Found `find $GHBU_BACKUP_DIR -maxdepth 1 -name '*.tar.gz' -a \! -name '*.prev.tar.gz' -a \! -name '*.latest.tar.gz' -mtime +${GHBU_PRUNE_AFTER_N_DAYS} | wc -l` files to prune."
find $GHBU_BACKUP_DIR -maxdepth 1 -name '*.tar.gz' -a \! -name '*.prev.tar.gz' -a \! -name '*.latest.tar.gz' -mtime "+${GHBU_PRUNE_AFTER_N_DAYS}" -exec rm -fv {} > /dev/null \;
$GHBU_SILENT || (echo "" && echo "=== PRUNING FINISHED ===" && echo "")
fi
prune_incomplete
$GHBU_SILENT || (echo "" && echo "=== DONE ===" && echo "")
$GHBU_SILENT || (echo "GitHub backup for ${GHBU_ORG} (${GHBU_ORGMODE}) completed." && echo "")