You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

459 lines
20 KiB

  1. #!/usr/bin/perl
  2. # Project: Web Reference Database (refbase) <http://www.refbase.net>
  3. # Copyright: Matthias Steffens <mailto:refbase@extracts.de> and the file's
  4. # original author(s).
  5. #
  6. # This code is distributed in the hope that it will be useful,
  7. # but WITHOUT ANY WARRANTY. Please see the GNU General Public
  8. # License for more details.
  9. #
  10. # File: ./contrib/command_line/refbase_import
  11. # Repository: $HeadURL: file:///svn/p/refbase/code/branches/bleeding-edge/contrib/command_line/refbase_import $
  12. # Author(s): Matthias Steffens <mailto:refbase@extracts.de>
  13. #
  14. # Created: 06-Jun-06, 18:00
  15. # Modified: $Date: 2008-10-30 18:16:54 +0000 (Thu, 30 Oct 2008) $
  16. # $Author: msteffens $
  17. # $Revision: 1289 $
  18. # REFBASE_IMPORT -- a refbase command line interface
# Purpose: Perl script for uploading data in various formats to a refbase online database from the command line
  20. # Usage: refbase_import [OPTIONS] [FILE]
  21. # Help: For help with the syntax type 'refbase_import -h'
  22. # To view some usage examples type 'refbase_import -X'
  23. # Further information is available at <http://cli.refbase.net/>
  24. # A list of supported import formats is given at <http://import.refbase.net/>
  25. # Version: 1.2
  26. # Requires: - a shell with Perl execution capabilities
  27. # - the Perl CPAN modules LWP::UserAgent, HTTP::Request::Common, HTTP::Response, HTTP::Cookies and URI::URL
  28. # - access with import permissions to a refbase database (refbase-0.9.0 or greater)
  29. # Limits: - The character encoding of your import data must match the encoding of your refbase database (i.e., 'latin1' or 'utf8')
  30. # - The authentication mechanism is currently limited in that a given password will be transferred as parameter in the POST request
  31. # --------------------------------------------------------------------------------------------------------------
  32. $version = "1.2";
  33. # Configure variables:
  34. # Specify the full URLs to any refbase servers that shall be queried:
  35. # Notes: - the given hash keys will work as shortcuts, e.g. '--host=local' would upload
  36. # data to your local refbase installation; one hash key must be named 'default',
  37. # all other keys can be freely chosen
  38. # - by default, data will be uploaded to the server labeled with key 'default'
  39. %hosts = (
  40. 'default' => 'http://beta.refbase.net/',
  41. 'local' => 'http://localhost/refs/',
  42. 'beta' => 'http://beta.refbase.net/',
  43. 'beta2' => 'http://refbase.textdriven.com/beta/',
  44. 'demo' => 'http://demo.refbase.net/',
  45. 'org' => 'http://www.refbase.org/'
  46. );
  47. # Specify the default values for all options that are not explicitly specified:
  48. %params = (
  49. # import options:
  50. 'skipBadRecords' => '0', # -b|--skipbad -> must be '0' (don't skip records with unrecognized data format) or '1' (skip records with unrecognized data format)
  51. 'importRecordsRadio' => 'all', # -i|--import -> must be 'all' (import all records) or 'only' (import only those records specified in 'importRecords')
  52. 'sourceIDs' => '', # -p|--pmid -> this also applies for '--arxiv|--doi|--openurl' since they are essentially just aliases for '-p'
  53. 'importRecords' => '1', # -r|--records -> must be a list of numbers and/or ranges (e.g., '1-5' will import the first five records; '1 3-5 7' will import records 1, 3, 4, 5 and 7)
  54. 'formType' => 'data', # -t|--type -> must be 'data' (generic data import) or 'id' (import via ID)
  55. # fixed parameters:
  56. 'client' => "cli-refbase_import-" . $version # the client ID of this command line utility
  57. );
  58. %outputParams = (
  59. # output options:
  60. 'citeStyle' => '', # -C|--style => desired citation style, given name must match an entry within the database's MySQL table 'styles' (keep empty to use the database default)
  61. 'format' => 'ascii', # -F|--format => output format must be 'html', 'rtf', 'pdf', 'latex', 'latex_bbl', 'markdown', 'ascii', 'ads', 'bibtex', 'endnote', 'isi', 'ris', 'atom', 'mods', 'oai_dc', 'odf', 'srw_dc', 'srw_mods', 'word' or '' (the empty string '' will produce the default 'ascii' output style)
  62. 'showLinks' => '1', # -L|--showlinks => hide/display links column in HTML output; must be '0', '1', or '' (the empty string '' will produce the default output style, i.e. print any links)
  63. 'citeOrder' => 'author', # -O|--order => cite order must be 'author', 'year', 'type', 'type-year', 'creation-date' or '' (the empty string '' will produce the default 'author' sort order)
  64. 'viewType' => 'web' # -V|--view => view type of HTML output; must be 'Web', 'Print', 'Mobile' or '' (the empty string '' will produce the default 'Web' output style)
  65. );
  66. # Specify the default login credentials for a refbase user account:
  67. # Imported data will get associated with this user account
  68. %loginParams = (
  69. 'loginEmail' => '', # -U|--user -> the login email address of an existing refbase user with import permissions
  70. 'loginPassword' => '' # -P|--password -> the password for the given user account
  71. );
  72. # Specify the location of the cookie jar file:
  73. # This file will be used to store & retrieve cookies
  74. $cookieJarFile = "$ENV{HOME}/.lwpcookies.txt";
  75. # --------------------------------------------------------------------------------
  76. # NOTE: You shouldn't need to change anything below this line
  77. # CPAN modules:
  78. use LWP::UserAgent; # more info: <http://search.cpan.org/~gaas/libwww-perl-5.805/lib/LWP/UserAgent.pm>
  79. use HTTP::Request::Common; # more info: <http://search.cpan.org/~gaas/libwww-perl-5.805/lib/HTTP/Request/Common.pm>
  80. use HTTP::Response; # more info: <http://search.cpan.org/~gaas/libwww-perl-5.805/lib/HTTP/Response.pm>
  81. use HTTP::Cookies; # more info: <http://search.cpan.org/~gaas/libwww-perl-5.805/lib/HTTP/Cookies.pm>
  82. use URI::URL; # more info: <http://search.cpan.org/~gaas/URI-1.35/URI/URL.pm>
  83. # initialize variables:
  84. $host = $hosts{'default'};
  85. $format = '';
  86. # Extract options:
  87. # TODO: use Getopt::Long
  88. # general options:
  89. if (($ARGV[0] eq '--help') or ($ARGV[0] eq '-h') or ($ARGV[0] eq '')) { &usage (0); } # if the user asked for --help/-h or didn't provide any input, call the 'usage' subroutine
  90. elsif (($ARGV[0] eq '--version') or ($ARGV[0] eq '-v')) { &version (0); } # show version information
  91. elsif (($ARGV[0] eq '--examples') or ($ARGV[0] eq '-X')) { &examples (0); } # print some usage examples
  92. else {
  93. foreach (@ARGV) {
  94. # extract import options:
  95. if ($_ =~ /^(?:-b|--skipbad)=(.+)$/) { $params{'skipBadRecords'} = $1; }
  96. elsif ($_ =~ /^(?:-i|--import)=(.+)$/) { $params{'importRecordsRadio'} = $1; }
  97. elsif ($_ =~ /^(?:-p|--pmid|--arxiv|--doi|--openurl)=(.+)$/) { $params{'sourceIDs'} = $1; }
  98. elsif ($_ =~ /^(?:-r|--records)=(.+)$/) { $params{'importRecords'} = $1; }
  99. elsif ($_ =~ /^(?:-t|--type)=(.+)$/) { $params{'formType'} = $1; }
  100. # extract output options:
  101. elsif ($_ =~ /^(?:-C|--style)=(.+)$/) { $outputParams{'citeStyle'} = $1; }
  102. elsif ($_ =~ /^(?:-F|--format)=(.+)$/) { $outputParams{'format'} = $1; }
  103. elsif ($_ =~ /^(?:-L|--showlinks)=(.+)$/) { $outputParams{'showLinks'} = $1; }
  104. elsif ($_ =~ /^(?:-O|--order)=(.+)$/) { $outputParams{'citeOrder'} = $1; }
  105. elsif ($_ =~ /^(?:-V|--view)=(.+)$/) { $outputParams{'viewType'} = $1; }
  106. # extract server options:
  107. elsif ($_ =~ /^(?:-H|--host)=(.+)$/) { $host = $1; }
  108. elsif ($_ =~ /^(?:-P|--password)=(.+)$/) { $loginParams{'loginPassword'} = $1; }
  109. elsif ($_ =~ /^(?:-U|--user)=(.+)$/) { $loginParams{'loginEmail'} = $1; }
  110. # extract file:
  111. # (note that if multiple files were given, only the last given file will be honoured)
  112. elsif ($_ =~ /^(?!(-[biprtCFLOVHPU]|--(?:skipbad|import|pmid|arxiv|doi|openurl|records|type|style|format|showlinks|order|view|host|password|user))=)([^ ]+)/) { @sourceFile = $2; }
  113. }
  114. }
  115. # for '--type=data', check if a source file was specified:
  116. if (($params{'formType'} =~ /^data$/i) && (scalar @sourceFile == 0)) {
  117. print "There were validation errors regarding the data you submitted:\n\n";
  118. print "FILE: The file operand is missing! The generic data import feature ('--type=data')\n"
  119. . " requires a FILE to be specified. Type 'refbase_import -X' to see some usage\n"
  120. . " examples. For general help with the syntax type 'refbase_import -h'.\n\n";
  121. exit;
  122. }
  123. # for '--type=id' (or, previously: --type=pmid), check if at least one PubMed ID, arXiv ID, DOI or OpenURL was given:
  124. # TODO: improve identification/verification of the given IDs
  125. elsif (($params{'formType'} =~ /^(pm)?id$/i) && ($params{'sourceIDs'} !~ /\d+/)) {
  126. print "There were validation errors regarding the data you submitted:\n\n";
  127. print "sourceIDs: You must specify at least one PubMed ID, arXiv ID, DOI or OpenURL! The 'import via ID'\n"
  128. . " feature ('--type=id') requires the '-p, --pmid' option or one of '--arxiv|--doi|--openurl'\n"
  129. . " to be specified. Type 'refbase_import -X' to see some usage examples. For general help\n"
  130. . " with the syntax type 'refbase_import -h'.\n\n";
  131. exit;
  132. }
  133. # adjust form type value:
  134. if ($params{'formType'} =~ /^(pm)?id$/i) { # --type=id (or, previously: --type=pmid)
  135. $params{'formType'} = "importID";
  136. }
  137. else { # --type=data
  138. $params{'formType'} = "import";
  139. }
  140. # resolve any host shortcuts:
  141. if (exists($hosts{$host})) {
  142. $host = $hosts{$host};
  143. }
  144. elsif ($host !~ /^https?:\/\//i) {
  145. $host = $hosts{'default'}; # can't resolve given host, reset back to default
  146. }
  147. # assign correct URL params based on the '-F|--format' option:
  148. if (exists($outputParams{'format'})) {
  149. $format = $outputParams{'format'};
  150. if ($format =~ /^(html|rtf|pdf|latex|latex_bbl|markdown|ascii)$/i) {
  151. $outputParams{'submit'} = "Cite";
  152. }
  153. if ($format =~ /^(html|rtf|pdf|latex|latex_bbl|markdown|ascii)$/i) {
  154. $format =~ s/^latex_bbl$/LaTeX .bbl/i;
  155. $outputParams{'citeType'} = $format;
  156. }
  157. elsif ($format =~ /^(ads|bibtex|endnote|isi|ris|atom|mods|oai_dc|odf|srw(_dc|_mods)?|word)$/i) {
  158. $outputParams{'submit'} = "Export";
  159. $outputParams{'exportType'} = "file";
  160. if ($format =~ /^(ads|bibtex|endnote|isi|ris)$/i) {
  161. $outputParams{'exportFormat'} = $format;
  162. }
  163. elsif ($format =~ /^(atom|mods|oai_dc|odf|srw(_dc|_mods)?|word)$/i) {
  164. $outputParams{'exportFormat'} = $format . " xml";
  165. }
  166. }
  167. else {
  168. $outputParams{'citeType'} = "ascii";
  169. }
  170. delete($outputParams{'format'});
  171. }
  172. # construct URL:
  173. # (uses URI::URL)
  174. $importScript = "import_modify.php";
  175. $importURL = url($host . $importScript);
  176. # initialize new user agent:
  177. # (uses LWP::UserAgent)
  178. $userAgent = LWP::UserAgent->new;
  179. # set user agent string:
  180. $userAgent->agent("refbase_import/" . $version . " (http://cli.refbase.net/) ");
  181. # set cookie jar object:
  182. # LWP will collect cookies and respond to cookie requests via its cookie jar, thus
  183. # enabling the user agent to fetch a PHP session ID from the refbase login response
  184. # and automatically resend it upon next import request
  185. $userAgent->cookie_jar({ file => $cookieJarFile, autosave => 1 });
  186. # attempt to authenticate using the given login credentials:
  187. if (($loginParams{'loginEmail'} ne '') && ($loginParams{'loginPassword'} ne '')) {
  188. $loginSuccessful = &login(0); # call the 'login' subroutine
  189. }
  190. else {
  191. $loginSuccessful = 0;
  192. }
  193. if (!$loginSuccessful) {
  194. print "Login failed! You provided an incorrect email address or password.\n\n";
  195. exit;
  196. }
  197. # send POST request:
  198. # (uses HTTP::Request::Common & HTTP::Response)
  199. if ($params{'formType'} =~ /^importID$/i) { # --type=id (or, previously: --type=pmid)
  200. $request = POST $importURL, \%params;
  201. }
  202. else { # --type=data
  203. $params{'uploadFile'} = \@sourceFile;
  204. $request = POST $importURL, Content_Type => 'form-data', Content => \%params;
  205. }
  206. $response = $userAgent->request($request);
  207. if ($response->is_error()) {
  208. print STDERR $response->status_line, "\n";
  209. }
  210. else {
  211. $location = $response->header('Location');
  212. if ($location ne '') {
  213. if ($location =~ /show.php/) {
  214. # display imported records:
  215. foreach $key (keys %outputParams) {
  216. $location .= "&" . $key . "=" . $outputParams{$key};
  217. }
  218. if ($location =~ /&headerMsg=\D*(\d+)/i) {
  219. $location .= "&showRows=" . $1;
  220. }
  221. }
  222. # construct URL:
  223. # (uses URI::URL)
  224. $responseURL = url($host . $location);
  225. # send GET request:
  226. # (uses HTTP::Request::Common & HTTP::Response)
  227. $request = GET $responseURL;
  228. $response = $userAgent->request($request); # or use: $response = $userAgent->get($responseURL);
  229. }
  230. binmode STDOUT;
  231. print $response->content();
  232. }
  233. # --------------------------------------------------------------------------------
  234. # Login with login credentials given in '%loginParams':
  235. sub login
  236. {
  237. local ($status) = @_;
  238. # construct URL:
  239. # (uses URI::URL)
  240. $loginScript = "user_login.php";
  241. $loginURL = url($host . $loginScript);
  242. # send POST request:
  243. # (uses HTTP::Request::Common & HTTP::Response)
  244. $request = POST $loginURL, \%loginParams;
  245. $response = $userAgent->request($request);
  246. if ($response->is_error()) {
  247. print STDERR $response->status_line, "\n";
  248. exit $status;
  249. }
  250. else {
  251. $location = $response->header('Location');
  252. # upon successful login, refbase will redirect to 'index.php'
  253. if ($location =~ /index.php/) {
  254. return 1; # login successful
  255. }
  256. else {
  257. return 0; # login NOT successful
  258. }
  259. }
  260. }
  261. # --------------------------------------------------------------------------------
  262. # Print usage and exit:
  263. sub usage
  264. {
  265. local ($status) = @_;
  266. print "\nrefbase_import command line client, v" . $version . " by Matthias Steffens, http://cli.refbase.net/\n\n"
  267. . "Usage: refbase_import [OPTIONS] [FILE]\n\n"
  268. . "Notes: - Two import modes are supported:\n"
  269. . " 1) '--type=data' requires an import FILE to be specified;\n"
  270. . " for supported import formats, see: http://import.refbase.net/\n"
  271. . " 2) '--type=id' requires the '-p, --pmid' option or one of '--arxiv|--doi|--openurl' with\n"
  272. . " one or more whitespace-delimited PubMed IDs, arXiv IDs, DOIs or OpenURLs, respectively.\n"
  273. . " - Options syntax: [OPTION]=[VALUE], e.g. '-p=16351846' or '--pmid=\"16351846 16783713\"'.\n"
  274. . " - For each option, default values can be specified at the top of the script.\n"
  275. . " Current defaults are given in parentheses.\n\n"
  276. . "General Options: -h, --help - display this help text\n"
  277. . " -v, --version - display version information\n"
  278. . " -X, --examples - display usage examples\n\n"
  279. . "Import Options: -b, --skipbad - skip records with unrecognized data format ('" . $params{'skipBadRecords'} . "')\n"
  280. . " possible values: 0, 1\n"
  281. . " -i, --import - import all or only some records ('" . $params{'importRecordsRadio'} . "')\n"
  282. . " possible values: all, only\n"
  283. . " -p, --pmid, - IDs of records to import ('" . $params{'sourceIDs'} . "')\n"
  284. . " --arxiv, supported IDs: PubMed ID (PMID), arXiv ID, DOI, OpenURL\n"
  285. . " --doi, \n"
  286. . " --openurl \n"
  287. . " -r, --records - positional numbers and/or ranges of records to import ('" . $params{'importRecords'} . "')\n"
  288. . " requires the '--import=only' option\n"
  289. . " -t, --type - import type ('" . $params{'formType'} . "')\n"
  290. . " possible values: data, id\n\n"
  291. . "Output Options: -C, --style - citation style ('" . $outputParams{'citeStyle'} . "')\n"
  292. . " -F, --format - output format ('" . $outputParams{'format'} . "')\n"
  293. . " possible values: html, rtf, pdf, latex, latex_bbl, markdown, ascii,\n"
  294. . " ads, bibtex, endnote, isi, ris, atom, mods, oai_dc,\n"
  295. . " odf, srw_dc, srw_mods, word\n"
  296. . " -L, --showlinks - hide/display links column in html output ('" . $outputParams{'showLinks'} . "')\n"
  297. . " possible values: 0, 1\n"
  298. . " -O, --order - sort order of returned records ('" . $outputParams{'citeOrder'} . "')\n"
  299. . " possible values: author, year, type, type-year, creation-date\n"
  300. . " -V, --view - view type of html output ('" . $outputParams{'viewType'} . "')\n"
  301. . " possible values: web, print, mobile\n\n"
  302. . "Server Options: -H, --host - URL of the refbase database ('" . $host . "')\n"
  303. . " defined shortcuts: " . join(', ', sort keys(%hosts)) . "\n"
  304. . " -P, --password - password for given '-U, --user' account";
  305. if ($loginParams{'loginPassword'} ne '') {
  306. print "\n (a default pwd has been defined)\n";
  307. }
  308. else {
  309. print " ('')\n";
  310. }
  311. print " -U, --user - login email address of an existing refbase user with\n"
  312. . " import permissions ('" . $loginParams{'loginEmail'} . "')\n\n";
  313. exit $status;
  314. }
  315. # --------------------------------------------------------------------------------
  316. # Print version number and exit:
  317. sub version
  318. {
  319. local ($status) = @_;
  320. print "\nrefbase_import command line client, version " . $version
  321. . "\ncheck for updates at http://cli.refbase.net/\n\n";
  322. exit $status;
  323. }
  324. # --------------------------------------------------------------------------------
  325. # Print examples and exit:
  326. sub examples
  327. {
  328. local ($status) = @_;
  329. print <<'END_EXAMPLES';
  330. --------------------------------------------------------------------------------
  331. REFBASE_IMPORT USAGE EXAMPLES:
  332. --------------------------------------------------------------------------------
  333. 1) Import BibTeX records from file 'import.bib' using the defaults defined
  334. within the refbase_import script:
  335. refbase_import import.bib
  336. --------------------------------------------------------------------------------
  337. 2) Import all Endnote records given in file 'import.enw' into your default
  338. refbase database:
  339. refbase_import -t=data -i=all import.enw
  340. --------------------------------------------------------------------------------
  341. 3) Take RIS records from file 'import.ris' but import only the first three as
  342. well as the fifth and the tenth record into your local refbase database:
  343. refbase_import -H=local -t=data -i=only -r=1-3,5,10 import.ris
  344. --------------------------------------------------------------------------------
  345. 4) Import MODS XML records from file 'mods.xml' into the refbase demo database
  346. using the defaults defined within the refbase_import script:
  347. refbase_import -H=http://demo.refbase.net/ -U=user@refbase.net -P=user mods.xml
  348. --------------------------------------------------------------------------------
  349. 5) Fetch two records from PubMed.gov via their PMID (i.e. the unique PubMed
  350. identifier, in this example, records with PMIDs 16351846 and 16783713) and
  351. import them into your local refbase database:
  352. refbase_import -H=local -t=id -p="16351846 16783713"
  353. --------------------------------------------------------------------------------
  354. 6) Fetch two records from CrossRef.org via their DOI (i.e. the unique Document
  355. Object Identifier, in this example, records with DOIs 10.3354/meps251037 and
  356. 10.1103/PhysRev.47.777) and import them into your local refbase database:
  357. refbase_import -H=local -t=id --doi="10.3354/meps251037 10.1103/PhysRev.47.777"
  358. --------------------------------------------------------------------------------
  359. 7) Fetch three records from arXiv.org via their arXiv ID (i.e. the unique arXiv
  360. identifier, in this example, records with arXiv IDs astro-ph/0609768, 0806.1829
  361. and 0802.0204v1) and import them into your local refbase database:
  362. refbase_import -H=local -t=id --arxiv="astro-ph/0609768 0806.1829 0802.0204v1"
  363. --------------------------------------------------------------------------------
  364. END_EXAMPLES
  365. exit $status;
  366. }
  367. __END__