<?php
/**
 * Maintenance script to grab images from a wiki and save them to disk.
 * Translated from Misza's Python version.
 *
 * @file
 * @ingroup Maintenance
 * @author Jack Phoenix <[email protected]>
 * @author Jesús Martínez <[email protected]>
 * @version 1.0
 * @date 11 April 2021
 */
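/*
 * Example invocation (a hypothetical sketch: the wiki URL, folder and file
 * name are placeholders, and --url is assumed to be provided by the shared
 * grabber base classes rather than by this script):
 *
 *   php grabImages.php --url="https://example.org/w/api.php" \
 *       --folder="/tmp/grabbed-images" --from="Example.png"
 */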
require_once 'includes/FileGrabber.php';

class GrabImages extends FileGrabber {

	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Get images from an external wiki and save them to our disk. ' .
			'This script does not import them to the wiki. If you want files to be imported, use grabFiles instead.';
		$this->addOption( 'folder', 'Folder to save images to', true /* required */, true /* withArg */ );
		$this->addOption( 'from', 'Name of file to start from', false /* required */, true /* withArg */ );
	}

	/**
	 * Entry point: queries the source wiki's API for all images and
	 * downloads each one to the output folder.
	 */
	public function execute() {
		parent::execute();

		$folder = $this->getOption( 'folder' );
		if ( !file_exists( $folder ) ) {
			// fatalError() prints the message and terminates the script
			$this->fatalError( "Output folder doesn't exist: {$folder}" );
		}
		$this->output( "The directory where images will be stored is: {$folder}\n" );
		$params = [
			'generator' => 'allimages',
			'gailimit' => 'max',
			'prop' => 'imageinfo',
			'iiprop' => 'url|sha1',
			'iilimit' => '1'
		];
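		// Resume support: if --from was given, start the listing at that title.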
		$gaifrom = $this->getOption( 'from' );
		if ( $gaifrom !== null ) {
			$params['gaifrom'] = $gaifrom;
		}

		$more = true;
		$count = 0;
		while ( $more ) {
			$result = $this->bot->query( $params );
			if ( empty( $result['query']['pages'] ) ) {
				$this->fatalError( 'No files found...' );
			}
			foreach ( $result['query']['pages'] as $file ) {
				$count += $this->processFile( $file, $folder );
			}
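			// Handle both API continuation formats: older wikis return a
			// 'query-continue' block (carry gaifrom forward manually), newer
			// ones return 'continue' to merge into the next request.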
			if ( isset( $result['query-continue'] ) ) {
				$params['gaifrom'] = $result['query-continue']['allimages']['gaifrom'];
			} elseif ( isset( $result['continue'] ) ) {
				$params = array_merge( $params, $result['continue'] );
			} else {
				$more = false;
			}
		}

		$this->output( "$count files downloaded.\n" );
	}

	/**
	 * Downloads the image returned by the API.
	 *
	 * @param array $entry Page data returned from the API with imageinfo
	 * @param string $folder Folder to save the file to
	 * @return int 1 if the image has been downloaded, 0 otherwise
	 */
	function processFile( $entry, $folder ) {
		$name = $this->sanitiseTitle( $entry['ns'], $entry['title'] );
		$count = 0;
		// We're getting only one file revision (the latest one)
		foreach ( $entry['imageinfo'] as $fileVersion ) {
			# Check for Wikia's videos
			if ( $this->isWikiaVideo( $fileVersion ) ) {
				$this->output( "...File {$name} appears to be a video, skipping it.\n" );
				return 0;
			}
			if ( !isset( $fileVersion['url'] ) ) {
				# If the file is suppressed and we don't have permission to
				# view it, the API returns neither the URL nor the MIME type.
				# Skip the file revision instead of crashing.
				$this->output( "...File {$name} is suppressed, skipping it.\n" );
				return 0;
			}
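			// The API reports the SHA-1 as 40 hex digits, while MediaWiki
			// stores file hashes as 31-character base-36 strings; convert
			// before handing it to downloadFile(), which presumably uses it
			// to verify the downloaded content.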
			$url = $this->sanitiseUrl( $fileVersion['url'] );
			$sha1 = Wikimedia\base_convert( $fileVersion['sha1'], 16, 36, 31 );
			$path = "$folder/$name";

			$status = $this->downloadFile( $url, $path, $sha1 );
			if ( $status->isOK() ) {
				$this->output( "Store {$url} as {$path} OK\n" );
				$count++;
			} else {
				$this->output( "Store {$url} as {$path} FAILED\n" );
			}
		}
		return $count;
	}
}

$maintClass = 'GrabImages';
require_once RUN_MAINTENANCE_IF_MAIN;