// Copyright: Matthias Steffens and the file's // original author(s). // // This code is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY. Please see the GNU General Public // License for more details. // // File: ./includes/oaidcxml.inc.php // Repository: $HeadURL$ // Author(s): Matthias Steffens // // Created: 05-Mar-08, 21:52 // Modified: $Date: 2017-04-13 02:00:18 +0000 (Thu, 13 Apr 2017) $ // $Author$ // $Revision: 1416 $ // This include file contains functions that'll export records to OAI_DC XML. // Requires ActiveLink PHP XML Package, which is available under the GPL from: // // TODO: I18n // Incorporate some include files: include_once 'includes/webservice.inc.php'; // include functions that are commonly used with the refbase webservices // Import the ActiveLink Packages require_once("classes/include.php"); import("org.active-link.xml.XML"); import("org.active-link.xml.XMLDocument"); // -------------------------------------------------------------------- // Return records as OAI_DC (i.e. 
// simple/unqualified Dublin Core) XML as required
// by the Open Archives Initiative Protocol for Metadata Harvesting (OAI-PMH).
//
// The returned document has a 'dcCollection' root element that declares the
// 'oai_dc', 'dc' and 'xsi' namespaces (see the 'xsi:schemaLocation' attribute
// set below for the schema that is referenced) and contains one branch per
// exported record.
//
// Parameters:
//   $result - query result that is iterated with 'mysqli_fetch_array()'
//
// Returns:
//   the XML document as a string (as produced by 'XMLDocument::getXMLString()')
//
// Side effects:
//   resets the global '$citeKeysArray' to an empty array before any record
//   is exported
function oaidcCollection($result)
{
    global $contentTypeCharset; // these variables are defined in 'ini.inc.php'
    global $convertExportDataToUTF8;

    global $citeKeysArray; // '$citeKeysArray' is made globally available from
                           // within this function

    // Individual records are objects and collections of records are strings

    $oaidcCollectionDoc = new XMLDocument();

    // Declare UTF-8 only if the export data will actually get converted to UTF-8,
    // otherwise stick with the charset that's configured for the database:
    if (($convertExportDataToUTF8 == "yes") AND ($contentTypeCharset != "UTF-8"))
        $oaidcCollectionDoc->setEncoding("UTF-8");
    else
        $oaidcCollectionDoc->setEncoding($contentTypeCharset);

    $oaidcCollection = new XML("dcCollection");
    $oaidcCollection->setTagAttribute("xmlns:oai_dc", "http://www.openarchives.org/OAI/2.0/oai_dc/");
    $oaidcCollection->setTagAttribute("xmlns:dc", "http://purl.org/dc/elements/1.1/");
    $oaidcCollection->setTagAttribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance");
    $oaidcCollection->setTagAttribute("xsi:schemaLocation", "http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd");

    // ----------------------------------------------------------

    // Add OAI_DC XML entries:

    $exportArray = array(); // array for individually exported records
    $citeKeysArray = array(); // array of cite keys (used to ensure uniqueness of cite keys among all exported records)

    // Generate the export for each record and push them onto an array:
    while ($row = @ mysqli_fetch_array($result))
    {
        // Export the current record as OAI_DC XML:
        $record = oaidcRecord($row, "oai_dc");

        if (!empty($record)) // unless the record buffer is empty...
            $exportArray[] = $record; // ...add it to an array of exports
    }

    // for each of the OAI_DC XML entries in the result set...
    foreach ($exportArray as $oaidc)
        $oaidcCollection->addXMLasBranch($oaidc);

    $oaidcCollectionDoc->setXML($oaidcCollection);
    $oaidcCollectionString = $oaidcCollectionDoc->getXMLString();

    return $oaidcCollectionString;
}

// --------------------------------------------------------------------

// Generate an OAI_DC (i.e. simple/unqualified Dublin Core) XML record:
// (returns an XML object (oaidc) of a single record)
//
// Parameters:
//   $row              - array with the field data of a single record, keyed by
//                       field name (e.g. a row as fetched in 'oaidcCollection()')
//   $metadataPrefix   - namespace prefix of the record's root element; the code
//                       below treats "oai_dc" and "srw_dc" specially ('dc:source'
//                       elements for the former, PRISM elements for the latter);
//                       if empty, the root element is named just "dc"
//   $addNameSpaceInfo - if true, namespace declarations get added as attributes
//                       to the record's root element
//
// Returns:
//   an 'XML' object that represents the record
//
// TODO: - see inline comments labeled with "TODO"
function oaidcRecord($row, $metadataPrefix = "oai_dc", $addNameSpaceInfo = true)
{
    global $databaseBaseURL; // these variables are defined in 'ini.inc.php'
    global $contentTypeCharset;
    global $fileVisibility;
    global $fileVisibilityException;
    global $filesBaseURL;
    global $convertExportDataToUTF8;
    global $defaultCiteStyle;

    global $citeStyle;

    global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower,
           $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'

    // The array '$transtab_refbase_unicode' contains search & replace patterns for conversion from refbase markup to Unicode entities.
    global $transtab_refbase_unicode; // defined in 'transtab_refbase_unicode.inc.php'

    // The array '$transtab_refbase_ascii' contains search & replace patterns for conversion from refbase markup to plain text.
    global $transtab_refbase_ascii; // defined in 'transtab_refbase_ascii.inc.php'

    // Define inline text markup to generate a plain text citation string:
    // (to be included within a 'dcterms:bibliographicCitation' element)
    $markupPatternsArrayPlain = array("bold-prefix"        => "", // NOTE: should we rather keep refbase font-shape markup (like _italic_ and **bold**) for plain text output?
                                      "bold-suffix"        => "",
                                      "italic-prefix"      => "",
                                      "italic-suffix"      => "",
                                      "underline-prefix"   => "",
                                      "underline-suffix"   => "",
                                      "endash"             => "-",
                                      "emdash"             => "-",
                                      "ampersand"          => "&",
                                      "double-quote"       => '"',
                                      "double-quote-left"  => '"',
                                      "double-quote-right" => '"',
                                      "single-quote"       => "'",
                                      "single-quote-left"  => "'",
                                      "single-quote-right" => "'",
                                      "less-than"          => "<",
                                      "greater-than"       => ">",
                                      "newline"            => "\n"
                                     );

    // This is a stupid hack that maps the names of the '$row' array keys to those used
    // by the '$formVars' array (which is required by function 'generateCiteKey()')
    // (eventually, the '$formVars' array should use the MySQL field names as names for its array keys)
    $formVars = buildFormVarsArray($row); // function 'buildFormVarsArray()' is defined in 'include.inc.php'

    // Generate or extract the cite key for this record:
    // (to be included within a 'dc:identifier' element)
    $citeKey = generateCiteKey($formVars); // function 'generateCiteKey()' is defined in 'include.inc.php'

    // Generate OpenURL data:
    // (to be included within a 'dc:identifier' element)
    $openURL = openURL($row, "openurl:"); // function 'openURL()' is defined in 'openurl.inc.php'

    // Encode special chars and perform charset conversions:
    foreach ($row as $rowFieldName => $rowFieldValue)
    {
        // We only convert those special chars to entities which are supported by XML:
        // function 'encodeHTMLspecialchars()' is defined in 'include.inc.php'
        $row[$rowFieldName] = encodeHTMLspecialchars($row[$rowFieldName]);

        // Convert field data to UTF-8:
        // (if '$convertExportDataToUTF8' is set to "yes" in 'ini.inc.php' and character encoding is not UTF-8 already)
        // (Note that charset conversion can only be done *after* the cite key has been generated, otherwise cite key
        //  generation will produce garbled text!)
        // function 'convertToCharacterEncoding()' is defined in 'include.inc.php'
        if (($convertExportDataToUTF8 == "yes") AND ($contentTypeCharset != "UTF-8"))
            $row[$rowFieldName] = convertToCharacterEncoding("UTF-8", "IGNORE", $row[$rowFieldName]);
    }

    // Defines field-specific search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element:
    // (If you don't want to perform any search and replace actions, specify an empty array, like: '$fieldSpecificSearchReplaceActionsArray = array();'.
    //  Note that the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
    //                                                "/Search Pattern/" => "Replace Pattern"
    $fieldSpecificSearchReplaceActionsArray = array();

    if ($convertExportDataToUTF8 == "yes")
        $fieldSpecificSearchReplaceActionsArray[] = array('fields'  => array("title", "publication", "abbrev_journal", "address", "keywords", "abstract", "orig_title", "series_title", "abbrev_series_title", "notes"),
                                                          'actions' => $transtab_refbase_unicode
                                                         );

    // Apply field-specific search & replace 'actions' to all fields that are listed in the 'fields' element of the arrays contained in '$fieldSpecificSearchReplaceActionsArray':
    foreach ($fieldSpecificSearchReplaceActionsArray as $fieldActionsArray)
        foreach ($row as $rowFieldName => $rowFieldValue)
            if (in_array($rowFieldName, $fieldActionsArray['fields']))
                $row[$rowFieldName] = searchReplaceText($fieldActionsArray['actions'], $rowFieldValue, true); // function 'searchReplaceText()' is defined in 'include.inc.php'

    // Fetch the name of the citation style file that's associated with the style given in '$citeStyle':
    $citeStyleFile = getStyleFile($citeStyle); // function 'getStyleFile()' is defined in 'include.inc.php'

    if (empty($citeStyleFile))
    {
        $citeStyle = $defaultCiteStyle; // if the given cite style could not be found, we'll use the default cite style which is defined by the '$defaultCiteStyle' variable in 'ini.inc.php'
        $citeStyleFile = getStyleFile($citeStyle);
    }

    // Include the found citation style file *once*:
    include_once "cite/" . $citeStyleFile;

    // Generate a proper citation for this record, ordering attributes according to the chosen output style & record type:
    // - Plain text version of citation string:
    $recordCitationPlain = citeRecord($row, $citeStyle, "", $markupPatternsArrayPlain, false); // function 'citeRecord()' is defined in the citation style file given in '$citeStyleFile' (which, in turn, must reside in the 'styles' directory of the refbase root directory)

    // Convert any refbase markup that remains in the citation string (such as _italic_ or **bold**) to plain text:
    $recordCitationPlain = searchReplaceText($transtab_refbase_ascii, $recordCitationPlain, true);

    // Convert any remaining refbase markup in the 'title', 'keywords' & 'abstract' fields to plain text:
    $row['title'] = searchReplaceText($transtab_refbase_ascii, $row['title'], true);
    $row['keywords'] = searchReplaceText($transtab_refbase_ascii, $row['keywords'], true);
    $row['abstract'] = searchReplaceText($transtab_refbase_ascii, $row['abstract'], true);

    // Strip any " (ed)" or " (eds)" suffix from author/editor string:
    if (preg_match("/ *\(eds?\)$/", $row['author']))
        $row['author'] = preg_replace("/[ \r\n]*\(eds?\)/i", "", $row['author']);

    if (preg_match("/ *\(eds?\)$/", $row['editor']))
        $row['editor'] = preg_replace("/[ \r\n]*\(eds?\)/i", "", $row['editor']);

    // Include a link to any corresponding file if one of the following conditions is met:
    // - the variable '$fileVisibility' (defined in 'ini.inc.php') is set to 'everyone'
    // - the variable '$fileVisibility' is set to 'login' AND the user is logged in
    // - the variable '$fileVisibility' is set to 'user-specific' AND the 'user_permissions' session variable contains 'allow_download'
    // - the array variable '$fileVisibilityException' (defined in 'ini.inc.php') contains a pattern (in array element 1) that matches the contents of the field given (in array element 0)
    //
    // TODO: - the URL-generating code should be made into a dedicated function (since it's shared with 'modsxml.inc.php' and 'atomxml.inc.php')
    $printURL = false;
    $URLprefix = ""; // only gets a non-empty value if the 'file' field holds a partial path or bare file name

    if ($fileVisibility == "everyone"
        OR ($fileVisibility == "login" AND isset($_SESSION['loginEmail']))
        OR ($fileVisibility == "user-specific" AND (isset($_SESSION['user_permissions']) AND preg_match("/allow_download/", $_SESSION['user_permissions'])))
        OR (!empty($fileVisibilityException) AND preg_match($fileVisibilityException[1], $row[$fileVisibilityException[0]])))
    {
        if (!empty($row['file']))
        {
            if (preg_match('#^(https?|ftp|file)://#i', $row['file'])) // if the 'file' field contains a full URL (starting with "http://", "https://", "ftp://", or "file://")
            {
                $URLprefix = ""; // we don't alter the URL given in the 'file' field
            }
            else // if the 'file' field contains only a partial path (like 'polarbiol/10240001.pdf') or just a file name (like '10240001.pdf')
            {
                // use the base URL of the standard files directory as prefix:
                if (preg_match('#^/#', $filesBaseURL)) // absolute path -> file dir is located outside of refbase root dir
                    $URLprefix = 'http://' . $_SERVER['HTTP_HOST'] . $filesBaseURL;
                else // relative path -> file dir is located within refbase root dir
                    $URLprefix = $databaseBaseURL . $filesBaseURL;
            }

            $printURL = true;
        }
    }

    // ----------------------------------------------------------

    // Start OAI_DC XML record:

    // Always define the prefix so that an empty '$metadataPrefix' yields a plain
    // "dc" root element instead of reading an undefined variable:
    $recordPrefix = "";

    if (!empty($metadataPrefix))
        $recordPrefix = $metadataPrefix . ":";

    $record = new XML($recordPrefix . "dc"); // create an XML object for a single record

    if ($addNameSpaceInfo)
    {
        if ($metadataPrefix == "oai_dc")
            $record->setTagAttribute("xmlns:oai_dc", "http://www.openarchives.org/OAI/2.0/oai_dc/");
        elseif ($metadataPrefix == "srw_dc")
            $record->setTagAttribute("xmlns:srw_dc", "info:srw/schema/1/dc-v1.1");

        $record->setTagAttribute("xmlns:dc", "http://purl.org/dc/elements/1.1/");

        if ($metadataPrefix == "oai_dc") // NOTE: should we include these for 'srw_dc:dc' output as well?
        {
            $record->setTagAttribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance");
            $record->setTagAttribute("xsi:schemaLocation", "http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd");
        }
        elseif ($metadataPrefix == "srw_dc")
            $record->setTagAttribute("xmlns:prism", "http://prismstandard.org/namespaces/1.2/basic/");
    }

    // Add Dublin Core elements:
    // NOTE: With a few exceptions, we try to adhere to the guidelines given at
    //       "Using simple Dublin Core to describe eprints" by Andy Powell et al.

    // - 'dc:title':
    if (!empty($row['title']))
        addMetaElement($record, "dc", "title", array(), $row['title']); // function 'addMetaElement()' is defined in 'webservice.inc.php'

    // - 'dc:creator':
    //   (skipped if the author string equals the editor string; the editor is then only given as 'dc:contributor')
    if (!empty($row['author']) AND ($row['author'] != $row['editor']))
        addMetaElement($record, "dc", "creator", array(), $row['author']);

    // - 'dc:creator':
    //   TODO: add refbase corporate author(s) as 'dc:creator'

    // - 'dc:contributor':
    if (!empty($row['editor']))
        addMetaElement($record, "dc", "contributor", array(), $row['editor']);

    // - 'dc:description':
    if (!empty($row['abstract']))
        addMetaElement($record, "dc", "description", array(), $row['abstract']);

    // - 'dc:identifier':

    //   - DOI:
    if (!empty($row['doi']))
        addMetaElement($record, "dc", "identifier", array(), $row['doi'], "doi");

    //   - PMID:
    //     (the complete 'notes' field is handed over together with the "pmid" type)
    if (!empty($row['notes']) AND preg_match("/PMID *: *\d+/i", $row['notes']))
        addMetaElement($record, "dc", "identifier", array(), $row['notes'], "pmid");

    //   - arXiv:
    //     (the complete 'notes' field is handed over together with the "arxiv" type)
    if (!empty($row['notes']) AND preg_match("/arXiv *: *[^ ;]+/i", $row['notes']))
        addMetaElement($record, "dc", "identifier", array(), $row['notes'], "arxiv");

    //   - ISBN:
    if (!empty($row['isbn']))
        addMetaElement($record, "dc", "identifier", array(), $row['isbn'], "isbn");

    //   - OpenURL:
    addMetaElement($record, "dc", "identifier", array(), $openURL, "openurl");

    //   - refbase ID:
    addMetaElement($record, "dc", "identifier", array(), $databaseBaseURL . generateURL("show.php", "html", array("record" => $row['serial']), true), "url");

    //   - Cite key:
    addMetaElement($record, "dc", "identifier", array(), $citeKey, "citekey");

    //   - Bibliographic citation:
    //     NOTE: In 'atomxml.inc.php', the bibliographic citation is put into a
    //           'dcterms:bibliographicCitation' element so that it can be uniquely
    //           identified and extracted easily. However, in case of simple Dublin
    //           Core output, we just put it into a 'dc:identifier' element and
    //           use a "citation:" prefix.
    addMetaElement($record, "dc", "identifier", array(), encodeHTMLspecialchars($recordCitationPlain), "citation");

    // - 'dc:source':
    //   NOTE: - In "Using simple Dublin Core to describe eprints",
    //           Andy Powell et al. recommend that this element should NOT be used!
    //           However, we use 'dc:source' elements for publication & series info
    //           (publication/series title plus volume & issue) to provide a dedicated
    //           source string that's easily readable and parsable.
    //           Example: Polar Biology, Vol. 25, No. 10
    //         - While we could also append the page info to the publication
    //           'dc:source' element, this info is more pertinent to the article
    //           itself and is thus not included. For 'srw_dc:dc' output, page info is
    //           included in PRISM elements (see below).
    //         - All metadata (including the page info) are also provided as a machine
    //           parsable citation in form of an OpenURL ContextObject (see above).

    //   - Publication info:
    //     NOTE: We only include the 'dc:source' element for 'oai_dc:dc' output. In case of 'srw_dc:dc'
    //           output, we use the more fine-grained PRISM elements instead (see below)
    if (($metadataPrefix == "oai_dc") AND (!empty($row['publication']) OR !empty($row['abbrev_journal'])))
    {
        if (!empty($row['publication']))
            $source = $row['publication'];
        elseif (!empty($row['abbrev_journal']))
            $source = $row['abbrev_journal'];

        if (!empty($row['volume']))
            $source .= ", Vol. " . $row['volume'];

        if (!empty($row['issue']))
            $source .= ", No. " . $row['issue'];

        if (!empty($source))
            addMetaElement($record, "dc", "source", array(), $source);
    }

    //   - Series info:
    if (!empty($row['series_title']) OR !empty($row['abbrev_series_title']))
    {
        if (!empty($row['series_title']))
            $series = $row['series_title'];
        elseif (!empty($row['abbrev_series_title']))
            $series = $row['abbrev_series_title'];

        if (!empty($row['series_volume']))
            $series .= ", Vol. " . $row['series_volume'];

        if (!empty($row['series_issue']))
            $series .= ", No. " . $row['series_issue'];

        if (!empty($series))
            addMetaElement($record, "dc", "source", array(), $series);
            // NOTE: To distinguish between regular publication & series info,
            //       should we better use a "series:" prefix here? If so, use:
            //       addMetaElement($record, "dc", "source", array(), $series, "series");
    }

    //   - ISSN:
    //     NOTE: for 'srw_dc:dc' output, we put the ISSN into the 'prism:issn' element
    if (($metadataPrefix == "oai_dc") AND !empty($row['issn']))
        addMetaElement($record, "dc", "source", array(), $row['issn'], "issn");

    // - 'dc:date':
    if (!empty($row['year']))
        addMetaElement($record, "dc", "date", array(), $row['year']);

    // - 'dc:type':
    //   (the contents of the 'thesis' field are handed over as the element type argument)
    if (!empty($row['type']))
        addMetaElement($record, "dc", "type", array(), $row['type'], $row['thesis']);

    //   In case of a thesis, we add another 'dc:type' element with the actual thesis type:
    if (!empty($row['thesis']))
        addMetaElement($record, "dc", "type", array(), $row['thesis']);

    // - 'dc:format':
    //   TODO: ideally, we should parse the content of the refbase 'medium' field and map it
    //         to a term from a standard media-type vocabulary
    if (!empty($row['medium']))
        $mediaType = $row['medium'];
    else
        $mediaType = "text"; // fall back to "text" if no medium has been specified

    addMetaElement($record, "dc", "format", array(), $mediaType);

    // - 'dc:subject':
    //   TODO: add user-specific keywords (from field 'user_keys') if the user is logged in
    if (!empty($row['keywords']))
        addMetaElement($record, "dc", "subject", array(), $row['keywords']);

    // - 'dc:coverage':
    //   TODO: should we add contents from the refbase 'area' field as 'dc:coverage' element(s)?

    // - 'dc:relation':

    //   - Related URL:
    if (!empty($row['url']))
        addMetaElement($record, "dc", "relation", array(), $row['url'], "url");

    //   - Related FILE:
    if ($printURL)
        addMetaElement($record, "dc", "relation", array(), $URLprefix . $row['file'], "file");

    // - 'dc:publisher':
    if (!empty($row['publisher']))
        addMetaElement($record, "dc", "publisher", array(), $row['publisher']);

    // - 'dc:language':
    //   TODO: convert to ISO notation (i.e. "en" instead of "English", etc)
    if (!empty($row['language']))
        addMetaElement($record, "dc", "language", array(), $row['language']);

    // ----------------------------------------------------------

    // Add PRISM elements:
    // NOTE: When using the 'srw_dc' namespace (i.e. 'info:srw/schema/1/dc-v1.1'),
    //       I don't think it's allowed to include anything but the fifteen elements
    //       from simple Dublin Core. Is this correct?
    //       If so, then:
    //
    // TODO: Do we need to put the PRISM elements into SRU extra record data instead? Or can we
    //       put them within a separate branch outside of (and next to) the 'srw_dc:dc' element?
    //       Or shall we better omit them entirely?
    //       See also "Mixing DC metadata with other metadata schemas" in "Guidelines for implementing
    //       Dublin Core in XML"
    if ($metadataPrefix == "srw_dc") // we only include PRISM elements for 'srw_dc:dc' output
    {
        // - 'prism:issn':
        if (!empty($row['issn']))
            addMetaElement($record, "prism", "issn", array(), $row['issn']);

        // - 'prism:publicationName':
        if (!empty($row['publication']))
            addMetaElement($record, "prism", "publicationName", array(), $row['publication']);
        elseif (!empty($row['abbrev_journal']))
            addMetaElement($record, "prism", "publicationName", array(), $row['abbrev_journal']);

        // - 'prism:publicationDate':
        if (!empty($row['year']))
            addMetaElement($record, "prism", "publicationDate", array(), $row['year']);

        // - 'prism:volume':
        if (!empty($row['volume']))
            addMetaElement($record, "prism", "volume", array(), $row['volume']);

        // - 'prism:number':
        if (!empty($row['issue']))
            addMetaElement($record, "prism", "number", array(), $row['issue']);

        // - 'prism:startingPage', 'prism:endingPage':
        //   TODO: Similar code is used in 'include.inc.php', 'modsxml.inc.php' and 'openurl.inc.php',
        //         so this should be made into a dedicated function!
        if (!empty($row['pages']) AND preg_match("/\d+/i", $row['pages'])) // if the 'pages' field contains a number
        {
            $pages = preg_replace("/^\D*(\d+)( *[$dash]+ *\d+)?.*/i$patternModifiers", "\\1\\2", $row['pages']); // extract page range (if there's any), otherwise just the first number
            $startPage = preg_replace("/^\D*(\d+).*/i", "\\1", $row['pages']); // extract starting page
            $endPage = extractDetailsFromField("pages", $pages, "/\D+/", "[-1]"); // extract ending page (function 'extractDetailsFromField()' is defined in 'include.inc.php')
            // NOTE: To extract the ending page, we'll use function 'extractDetailsFromField()'
            //       instead of just grabbing a matched regex pattern since it'll also work
            //       when just a number but no range is given (e.g. when startPage = endPage)

            // - 'prism:startingPage':
            if (preg_match("/\d+ *[$dash]+ *\d+/i$patternModifiers", $row['pages'])) // if there's a page range
                addMetaElement($record, "prism", "startingPage", array(), $startPage);

            // - 'prism:endingPage':
            //   (added whenever the 'pages' field contains a number, i.e. also if no range was given)
            addMetaElement($record, "prism", "endingPage", array(), $endPage);
        }
    }

    return $record;
}

// --------------------------------------------------------------------
?>