rijks-uploader

Upload Public Domain files from Rijksmuseum.nl to Wikimedia Commons
git clone http://git.hanabi.in/repos/rijks-uploader.git
Log | Files | Refs | README | LICENSE

commit fe05e9966a69342bcfd82b0bf81e7ddcbbaa3999
Author: Agastya Chandrakant <me@hanabi.in>
Date:   Sun, 24 Jan 2021 19:12:43 +0530

refactored code

Diffstat:
A .gitignore | 6 ++++++
A helperFns.js | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A index.js | 19 +++++++++++++++++++
A package.json | 7 +++++++
A utils.js | 34 ++++++++++++++++++++++++++++++++++
A yarn.lock | 25 +++++++++++++++++++++++++
6 files changed, 227 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -0,0 +1,5 @@ +node_modules +.env +data.csv +original-data.csv +*txt +\ No newline at end of file diff --git a/helperFns.js b/helperFns.js @@ -0,0 +1,136 @@ +const fetch = require("node-fetch"); +const Wikiapi = require("wikiapi"); + +const { promisify } = require("util"); +const fs = require("fs"); + +const readFile = promisify(fs.readFile); +const appendFile = promisify(fs.appendFile); + +const { + author, + cats, + errFile, + license, + LR, + params, + statusFile, + API, + PASSWORD, + USERNAME, + WIKI_API, +} = require("./utils.js"); + +function getDate(dating) { + const yearEarly = dating?.yearEarly || ""; + const yearLate = dating?.yearLate || ""; + return `{{other date|~|${yearEarly}|${yearLate}}}`; +} + +function updateContent(pageContent = '', moreCats = []) { + const moreCatsStr = moreCats.reduce((acc, cur) => acc + `[[Category:Uncategorised images of the Rijksmuseum (${cur})]]\n`, ''); + const finalCats = `${cats}\n${moreCatsStr}`; + return ( + pageContent + `\n== {{int:license-header}} ==\n${license}\n${LR}\n\n${finalCats}` + ); +} + +function getDesc(artObject, collectionID) { + return ( + artObject?.description || `Collection ${collectionID} of the Rijksmuseum` + ); +} + +function getTitle(artObject, collectionID) { + let title = artObject?.longTitle || artObject?.title; + if (title) title += " "; + title += `${collectionID} - Rijksmuseum`; + title = title.replace(/\[/g, '').replace(/\]/g, ''); + return title; +} + +function getUploadObj(media_url, filename, date, description, source) { + return { + media_url, + comment: "uploaded using API", + filename, + text: { + author, + date, + description, + source, + }, + // ignorewarnings: 1, + }; +} + +async function extractIdAndUpload(line) { + try { + const [colID, source, ...rest] = line.split(","); + const collectionID = stripBOM(colID); + const RijksAPIURL = `${API}${collectionID}${params}`; + const rijksAPIresult = await fetch(RijksAPIURL); + const 
jsonRijksAPIresult = await rijksAPIresult.json(); + const { artObject } = jsonRijksAPIresult; + if (!artObject) { + const content = `${collectionID},WRONG_RIJKS_API_RES\n`; + throw new Error(content); + } + const copyrightStatus = artObject?.copyrightHolder; + const { hasImage, webImage } = artObject; + const media_url = webImage?.url; + if (copyrightStatus || !hasImage || !webImage || !media_url) { + const content = `${collectionID},COPYRIGHTED or MISSING IMAGE\n`; + throw new Error(content); + } + const description = getDesc(artObject, collectionID); + const title = getTitle(artObject, collectionID); + const date = getDate(artObject?.dating); + const media = await fetch(media_url); + const media_blob = await media.blob(); + const fileExt = "." + media_blob?.type.split("/")[1]; + if (!fileExt) { + const content = `${collectionID},MISSING EXT\n`; + throw new Error(content); + } + const filename = title + fileExt; + const wiki = new Wikiapi(WIKI_API); + await wiki.login(USERNAME, PASSWORD); + const uploadObj = getUploadObj( + media_url, + filename, + date, + description, + source + ); + await wiki.upload(uploadObj); + const fileTitle = "File:" + filename; + const pageData = await wiki.page(fileTitle); + const pageContent = pageData?.wikitext; + const moreCats = artObject?.objectCollection; + const updatedContent = updateContent(pageContent, moreCats); + await wiki.edit_page(fileTitle, updatedContent); + const content = `${collectionID},DONE\n`; + await appendFile(statusFile, content); + } catch (err) { + const errStr = err.toString(); + let content = errStr; + if(!content.endsWith('\n')) content+="\n"; + await appendFile(errFile, content); + } +} + +function stripBOM(string = '') { + return (string.charCodeAt(0) === 0xFEFF) ? 
string.slice(1) : string; +} + +module.exports = { + appendFile, + extractIdAndUpload, + getDate, + getDesc, + getTitle, + getUploadObj, + readFile, + updateContent, +}; diff --git a/index.js b/index.js @@ -0,0 +1,19 @@ +const { appendFile, extractIdAndUpload, readFile } = require("./helperFns.js"); +const { dataFile, errFile, statusFile } = require("./utils.js"); + +async function main() { + try { + const CSVData = await readFile(dataFile, { encoding: "utf8" }); + const CSVLines = CSVData.split("\n"); + CSVLines.forEach(async line => await extractIdAndUpload(line)); + } catch (err) { + const errStr = err.toString(); + const content = errStr + "\n"; + await appendFile(errFile, content); + } finally { + const content = `ALL,DONE\n`; + await appendFile(statusFile, content); + } +} + +main(); diff --git a/package.json b/package.json @@ -0,0 +1,7 @@ +{ + "dependencies": { + "dotenv": "^8.2.0", + "node-fetch": "^2.6.1", + "wikiapi": "^1.14.0" + } +} diff --git a/utils.js b/utils.js @@ -0,0 +1,34 @@ +const dotenv = require("dotenv"); +dotenv.config(); +const { API_KEY, USERNAME, PASSWORD } = process.env; + +const path = require("path"); + +const dataFile = path.join(__dirname, "data.csv"); +const errFile = path.join(__dirname, "err.txt"); +const statusFile = path.join(__dirname, "status.txt"); + +const API = "https://www.rijksmuseum.nl/api/en/collection/"; +const params = `?format=json&key=${API_KEY}&culture=en`; + +const WIKI_API = "https://commons.wikimedia.org/w/api.php"; + +const author = "[[w:Rijksmuseum|Rijksmuseum]]"; +const license = `{{cc-zero|Rijksmuseum}}`; +const LR = `{{LicenseReview}}`; +const cats = "[[Category:Media from Rijksmuseum]]\n[[Category:Uncategorized images of the Rijksmuseum]]"; + +module.exports = { + API, + dataFile, + errFile, + params, + PASSWORD, + USERNAME, + WIKI_API, + author, + license, + LR, + cats, + statusFile +}; diff --git a/yarn.lock b/yarn.lock @@ -0,0 +1,25 @@ +# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. 
+# yarn lockfile v1 + + +cejs@latest: + version "4.1.1" + resolved "https://registry.yarnpkg.com/cejs/-/cejs-4.1.1.tgz#c3d473cc323cb4d8ca713848a8b0c1fb8e415b39" + integrity sha512-knpiBF8xdoCQzh/nIS4YKJfMLbYFkZ8nY8Rt3ED44WXq9SQ2x3bEIHEDDWS+owCvkt5TGTzFAF+gzMPEQn3IMw== + +dotenv@^8.2.0: + version "8.2.0" + resolved "https://registry.yarnpkg.com/dotenv/-/dotenv-8.2.0.tgz#97e619259ada750eea3e4ea3e26bceea5424b16a" + integrity sha512-8sJ78ElpbDJBHNeBzUbUVLsqKdccaa/BXF1uPTw3GrvQTBgrQrtObr2mUrE38vzYd8cEv+m/JBfDLioYcfXoaw== + +node-fetch@^2.6.1: + version "2.6.1" + resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052" + integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw== + +wikiapi@^1.14.0: + version "1.14.0" + resolved "https://registry.yarnpkg.com/wikiapi/-/wikiapi-1.14.0.tgz#e690652f979b585639208a4a9928ad4fc438edf8" + integrity sha512-VPEPKGbXp1xGSOqTVFrfPAs9yv6jkmCKFG79hOYNct3SmF3WaLXK9ipp1mrNK+iwB6tHl0TouSlZVB9Qr3uZ/g== + dependencies: + cejs latest