This script uses a few Perl modules (see lines 36-45 and 540 of the source code). To install them on Ubuntu, execute the following command:
apt-get install libhttp-oai-perl libwww-mechanize-perl libxml-saxon-xslt2-perl
You can also install them using the 'cpanm Module::Name' command.
wget http://home.agh.edu.pl/~polak/skrypty/pmh-convert.pl
chmod 755 pmh-convert.pl
After that, edit the script, carefully read the information contained in the comments (at the beginning of the script), and follow the steps provided there.
#!/usr/bin/perl
####################################################################################################
# PMH_CONVERT.PL written by Stanislaw Polak
# http://home.agh.edu.pl/~polak/
####################################################################################################
# By default, the script only converts OAI-PMH data to DOAJ.
# If you copy http://home.agh.edu.pl/~polak/skrypty/DBLP.xsl file,
# then it will also convert them to DBLP.
# If you copy http://home.agh.edu.pl/~polak/skrypty/PBN.xsl file,
# then it will also convert them to PBN.
####################################################################################################
# If you want to display documentation, then execute
# the following command: './pmh-convert.pl --man'
# or visit http://home.agh.edu.pl/~polak/skrypty/pmh-convert.man.php
####################################################################################################
use strict;
use warnings;

####################################################################################################
# BEFORE YOU START USING THIS SCRIPT,
# CHANGE THE VALUE OF THE FOLLOWING VARIABLES:
####################################################################################################
my $OAI_SITE            = 'http://journals.agh.edu.pl/csci/oai';              #URL of the source data (OAI-PMH data)
my $JOURNAL_NAME        = 'Computer Science Journal';                         #The name of your journal
my $JOURNAL_KEY         = 'csci';                                             #Journal key for DBLP
my $JOURNAL_PUBLISHER   = 'AGH University of Science and Technology Press';   #The publisher of the journal
my $JOURNAL_ADDRESS     = 'Krakow';                                           #Publisher's address
my $JOURNAL_URL         = 'http://journals.agh.edu.pl/csci/';                 #URL of the journal
my $JOURNAL_ISSN_PRINT  = '1508-2806';                                        #ISSN of your journal of print edition
my $JOURNAL_ISSN_ONLINE = '2300-7036';                                        #ISSN of your journal of electronic edition
my $DOAJ_FILENAME       = 'csci-doaj.xml';                                    #The name of the output DOAJ file
my $DBLP_FILENAME       = 'csci-dblp.xml';                                    #The name of the output DBLP file
my $PBN_FILENAME        = 'csci-pbn.xml';                                     #The name of the output PBN file

####################################################################################################
################### DO NOT EDIT BELOW THIS LINE ####################################################
####################################################################################################

# The modules used by the script.
use HTTP::OAI;
use LWP::Simple qw(get);
use Locale::Country;     # kept from the original import list
use Locale::Language;    # code2language / language2code / LOCALE_LANG_ALPHA_3
use WWW::Mechanize;
use HTML::FormatText;
use XML::LibXML;         # used directly below; previously only loaded indirectly
use Getopt::Long ();     # Resist name-space pollution!
use Pod::Usage   ();     # Ditto!
# XML::Saxon::XSLT2 is loaded on demand by _transform_doaj(), so it is only
# required when a DBLP.xsl / PBN.xsl stylesheet is actually present.

our $VERSION = '1.01';    # Version of the script (printed by --version)

my $DOAJ_SITE = 'http://www.doaj.org/uploadFile';    # Where $DOAJ_FILENAME is sent

# Matches one e-mail-like token (non-blank characters around an '@').
my $EMAIL_RE = qr/\b(\S+\@\S+)\b/;

####################################################################################################
# _node_text($oai_rec, $tag [, $index])
#
# Returns the text content of the $index-th (0-based, default 0) element named
# $tag in the metadata DOM of the OAI record, or undef when there is no such
# element. Always call in scalar context.
####################################################################################################
sub _node_text {
    my ( $oai_rec, $tag, $index ) = @_;
    $index //= 0;
    my @nodes = $oai_rec->metadata->dom->getElementsByTagName($tag);
    return defined $nodes[$index] ? $nodes[$index]->textContent : undef;
}

####################################################################################################
# _append_text_element($doc, $parent, $name, $text, %attributes)
#
# Creates element $name in $doc, sets the given attributes, puts $text into it
# as a text node, appends it to $parent and returns the new element.
####################################################################################################
sub _append_text_element {
    my ( $doc, $parent, $name, $text, %attributes ) = @_;
    my $element = $doc->createElement($name);
    for my $attr_name ( sort keys %attributes ) {
        $element->setAttribute( $attr_name, $attributes{$attr_name} );
    }
    $element->appendText($text);
    $parent->appendChild($element);
    return $element;
}

####################################################################################################
# OAIElementCopyContent($oai_name, $doaj_name, $oai_rec, $doaj_doc)
#
# Creates a '$doaj_name' element in $doaj_doc holding the text of the first
# '$oai_name' element of the OAI record.
#
# Returns the new (not yet attached) element, or nothing when the record has
# no '$oai_name' element - callers must check before appending.
####################################################################################################
sub OAIElementCopyContent {
    my ( $oai_name, $doaj_name, $oai_rec, $doaj_doc ) = @_;

    my $content = _node_text( $oai_rec, $oai_name );
    return unless defined $content;

    my $element = $doaj_doc->createElement($doaj_name);
    $element->appendText($content);
    return $element;
}

####################################################################################################
# _build_authors($oai_rec, $doaj_doc)
#
# Builds the 'authors' element from the "dc:creator" elements of the record.
# Each "dc:creator" is a ';'-separated string:
#   "Last name, First name; first affiliation; second affiliation; ..."
# An affiliation may contain an e-mail address, which is moved to the author's
# 'email' element.
#
# Returns ($authors_element, \@affiliations); the id of an affiliation is its
# 1-based position in @affiliations.
####################################################################################################
sub _build_authors {
    my ( $oai_rec, $doaj_doc ) = @_;

    my $authors = $doaj_doc->createElement('authors');
    my @affiliations;    # unique affiliations, in order of first appearance
    my %aff_id_of;       # affiliation text => its 1-based id

    for my $creator_node ( $oai_rec->metadata->dom->getElementsByTagName('dc:creator') ) {
        my $creator_text = $creator_node->textContent;
        my @creator      = split /;/, $creator_text;
        my $author       = $doaj_doc->createElement('author');

        # 'name': swap "Last, First" into "First Last"; if the text does not
        # have that shape, use it unchanged instead of relying on stale $1/$2.
        if ( $creator[0] ) {
            my $display_name = $creator[0];
            if ( $creator[0] =~ /(.+), (.+)/ ) {
                $display_name = "$2 $1";
            }
            _append_text_element( $doaj_doc, $author, 'name', $display_name );
        }

        # 'email': the first e-mail address found anywhere in the creator text.
        if ( $creator_text =~ $EMAIL_RE ) {
            _append_text_element( $doaj_doc, $author, 'email', $1 );
        }

        # 'affiliationId': one per affiliation (fields 1..n of the creator).
        for my $i ( 1 .. $#creator ) {
            my $affiliation = $creator[$i];
            $affiliation =~ s/$EMAIL_RE//;    # drop the e-mail address, if any
            $affiliation =~ s/\A\s+//;        # leading white space
            $affiliation =~ s/[\s,]+\z//;     # trailing white space / ","
            next if $affiliation eq '';       # e.g. the field held only an e-mail

            my $aff_id = $aff_id_of{$affiliation};
            if ( !defined $aff_id ) {
                push @affiliations, $affiliation;
                $aff_id = $aff_id_of{$affiliation} = scalar @affiliations;
            }
            _append_text_element( $doaj_doc, $author, 'affiliationId', $aff_id );
        }

        $authors->appendChild($author);
    }

    return ( $authors, \@affiliations );
}

####################################################################################################
# _find_full_text_url($page_url)
#
# Downloads the article landing page and returns the value of its
# 'citation_pdf_url' meta tag, or undef when the page cannot be fetched or
# contains no such tag.
####################################################################################################
sub _find_full_text_url {
    my ($page_url) = @_;

    my $page = get($page_url);
    return undef unless defined $page;    ## no critic (ProhibitExplicitReturnUndef) - scalar helper

    for my $line ( split /\R/, $page ) {
        if ( $line =~ /name="citation_pdf_url" content="([^"]*)"/ ) {
            return $1;
        }
    }
    return undef;                         ## no critic (ProhibitExplicitReturnUndef)
}

####################################################################################################
# _build_doaj_record($oai_rec, $doaj_doc)
#
# Converts one OAI record into a DOAJ 'record' element (created in $doaj_doc
# but not attached) and returns it. Child elements are emitted in the order:
# language, publisher, journalTitle, issn, publicationDate, volume, issue,
# startPage, doi, documentType, title, authors, affiliationsList, abstract,
# fullTextUrl, keywords.
####################################################################################################
sub _build_doaj_record {
    my ( $oai_rec, $doaj_doc ) = @_;

    my $record = $doaj_doc->createElement('record');

    # "dc:source" holds: "journal name; volume and issue; start page".
    my @source;
    my $source_text = _node_text( $oai_rec, 'dc:source' );
    @source = split /;/, $source_text if defined $source_text;

    # 'language': converted from a two-letter to a three-letter code when the
    # input is a known two-letter language code, otherwise copied as is.
    my $lang = _node_text( $oai_rec, 'dc:language' );
    if ($lang) {
        my $language_name = code2language($lang);
        if ($language_name) {
            my $alpha3 = language2code( $language_name, LOCALE_LANG_ALPHA_3 );
            $lang = $alpha3 if $alpha3;
        }
        _append_text_element( $doaj_doc, $record, 'language', $lang );
    }

    # 'publisher' <- "dc:publisher" (skipped when the record has none).
    my $publisher = OAIElementCopyContent( 'dc:publisher', 'publisher', $oai_rec, $doaj_doc );
    $record->appendChild($publisher) if defined $publisher;

    # 'journalTitle' <- first field of "dc:source".
    _append_text_element( $doaj_doc, $record, 'journalTitle', $source[0] ) if $source[0];

    # 'issn' <- $JOURNAL_ISSN_PRINT.
    _append_text_element( $doaj_doc, $record, 'issn', $JOURNAL_ISSN_PRINT ) if $JOURNAL_ISSN_PRINT;

    # 'publicationDate' <- digits following "csci." in the second "dc:identifier".
    # NOTE(review): "csci" is hard-coded here; presumably it has to be adapted
    # together with the journal settings above - confirm for other journals.
    my $second_identifier = _node_text( $oai_rec, 'dc:identifier', 1 );
    if ( defined $second_identifier && $second_identifier =~ /csci\.(\d+).*/ ) {
        _append_text_element( $doaj_doc, $record, 'publicationDate', $1 );
    }

    # 'volume' / 'issue' <- second field of "dc:source" ("Vol N, No M").
    if ( $source[1] ) {
        if ( $source[1] =~ /Vol (\d+)/ ) {
            _append_text_element( $doaj_doc, $record, 'volume', $1 );
        }
        if ( $source[1] =~ /No (\d+)/ ) {
            _append_text_element( $doaj_doc, $record, 'issue', $1 );
        }
    }

    # 'startPage' <- third field of "dc:source".
    if ( $source[2] ) {
        ( my $start_page = $source[2] ) =~ s/^\s+//;
        _append_text_element( $doaj_doc, $record, 'startPage', $start_page );
    }

    # 'doi' <- second "dc:identifier" (skipped when there is only one).
    if ( defined $second_identifier ) {
        _append_text_element( $doaj_doc, $record, 'doi', $second_identifier );
    }

    # 'documentType' <- "dc:type" (skipped when the record has none).
    my $document_type = OAIElementCopyContent( 'dc:type', 'documentType', $oai_rec, $doaj_doc );
    $record->appendChild($document_type) if defined $document_type;

    # 'title' <- "dc:title", converted to title case.
    my $title_text = _node_text( $oai_rec, 'dc:title' );
    if ( defined $title_text ) {
        $title_text =~ s/([\w']+)/\u\L$1/g;
        _append_text_element( $doaj_doc, $record, 'title', $title_text, language => 'eng' );
    }

    # 'authors' and, when any affiliation was found, 'affiliationsList'.
    my ( $authors, $affiliations ) = _build_authors( $oai_rec, $doaj_doc );
    $record->appendChild($authors);
    if ( @{$affiliations} ) {
        my $affiliations_list = $doaj_doc->createElement('affiliationsList');
        my $aff_id            = 1;
        for my $aff_name ( @{$affiliations} ) {
            _append_text_element( $doaj_doc, $affiliations_list, 'affiliationName',
                $aff_name, affiliationId => $aff_id );
            $aff_id++;
        }
        $record->appendChild($affiliations_list);
    }

    # 'abstract' <- "dc:description".
    my $abstract_text = _node_text( $oai_rec, 'dc:description' );
    if ( defined $abstract_text ) {
        _append_text_element( $doaj_doc, $record, 'abstract', $abstract_text, language => 'eng' );
    }

    # 'fullTextUrl': the first "dc:identifier" is the address of the article
    # page, which links to the PDF. The element is emitted (possibly empty)
    # whenever the record has an identifier, as before.
    my $landing_page = _node_text( $oai_rec, 'dc:identifier' );
    if ( defined $landing_page ) {
        my $pdf_url = _find_full_text_url($landing_page);
        _append_text_element( $doaj_doc, $record, 'fullTextUrl', $pdf_url // '', format => 'pdf' );
    }

    # 'keywords': every "dc:subject" holds comma-separated keywords; each one
    # becomes its own 'keyword' element.
    my @subject_nodes = $oai_rec->metadata->dom->getElementsByTagName('dc:subject');
    if (@subject_nodes) {
        my $keywords = $doaj_doc->createElement('keywords');
        $keywords->setAttribute( 'language', 'eng' );
        for my $subject_node (@subject_nodes) {
            for my $item ( split /,/, $subject_node->textContent ) {
                $item =~ s/\A\s+//;
                $item =~ s/\s+\z//;
                next if $item eq '';
                _append_text_element( $doaj_doc, $keywords, 'keyword', $item );
            }
        }
        $record->appendChild($keywords);
    }

    return $record;
}

####################################################################################################
# convertToDOAJ(\%oai_request_params)
#
# Harvests all records from $OAI_SITE using the given ListRecords parameters
# (metadataPrefix and, optionally, from / until), converts them to DOAJ and
# writes the result to $DOAJ_FILENAME. Dies when harvesting fails.
####################################################################################################
sub convertToDOAJ {
    my ($params_ref) = @_;
    my %oai_request_params = %{$params_ref};

    print "\033[1mOpening $OAI_SITE site\033[0m\n";
    my $harvester = HTTP::OAI::Harvester->new( baseURL => $OAI_SITE );

    my $response = $harvester->ListRecords(%oai_request_params);
    if ( $response->is_error ) {
        die( "Error harvesting: " . $response->message . "\n" );
    }

    # Tell the user which records are taken into account.
    my $message = "for all records";
    $message = "for records belonging to the interval from $oai_request_params{'from'}"
        if exists $oai_request_params{'from'};
    $message .= " until $oai_request_params{'until'}"
        if exists $oai_request_params{'until'};
    print "\033[1mGenerating $DOAJ_FILENAME file $message\033[0m\n";

    # Empty output document with the root element 'records'.
    my $doaj_doc = XML::LibXML->createDocument( "1.0", "UTF-8" );
    my $records  = $doaj_doc->createElement("records");
    $doaj_doc->setDocumentElement($records);

    while ( my $oai_rec = $response->next ) {
        print $oai_rec->identifier, "\n";

        # Records without metadata (e.g. deleted ones) cannot be converted.
        next unless defined $oai_rec->metadata;

        $records->appendChild( _build_doaj_record( $oai_rec, $doaj_doc ) );
    }

    # An error may also occur while fetching a later part of the list.
    if ( $response->is_error ) {
        die( "Error harvesting: " . $response->message . "\n" );
    }

    $doaj_doc->toFile( $DOAJ_FILENAME, 2 );    # 2 = indented output
    print "\033[1mThe file has been generated\033[0m\n";
    return;
}

####################################################################################################
# sendDOAJ($login, $password)
#
# Uploads $DOAJ_FILENAME through the first HTML form of $DOAJ_SITE, using the
# given credentials, and prints the server's answer as plain text.
####################################################################################################
sub sendDOAJ {
    my ( $login, $password ) = @_;

    my $mech = WWW::Mechanize->new();
    $mech->credentials( $login, $password );
    $mech->get($DOAJ_SITE);    # fetch the page with the upload form
    die "Cannot open $DOAJ_SITE\n" unless $mech->success;

    print "\033[1mI am sending $DOAJ_FILENAME to $DOAJ_SITE\033[0m\n";
    $mech->submit_form(
        form_number => 1,
        fields      => { fileName => $DOAJ_FILENAME, },
    );

    my $string = HTML::FormatText->format_string(
        $mech->content,
        leftmargin  => 0,
        rightmargin => 50,
    );
    print $string;
    return;
}

####################################################################################################
# _transform_doaj($xsl_file, $output_file, %parameters)
#
# Transforms $DOAJ_FILENAME with the XSLT 2.0 stylesheet $xsl_file, passing
# %parameters to the stylesheet, and saves the result (UTF-8) in $output_file.
# Dies when any of the files cannot be opened or written.
####################################################################################################
sub _transform_doaj {
    my ( $xsl_file, $output_file, %parameters ) = @_;

    require XML::Saxon::XSLT2;    # only needed when a stylesheet is present

    open( my $input, '<:encoding(UTF-8)', $DOAJ_FILENAME )
        or die "Cannot open $DOAJ_FILENAME: $!\n";
    open( my $xslt, '<:encoding(UTF-8)', $xsl_file )
        or die "Cannot open $xsl_file: $!\n";

    my $trans = XML::Saxon::XSLT2->new($xslt);
    $trans->parameters(%parameters);
    my $output = $trans->transform($input);

    close $xslt;
    close $input;

    open( my $out, '>:utf8', $output_file )
        or die "Cannot write $output_file: $!\n";
    print {$out} $output;
    close $out or die "Cannot write $output_file: $!\n";
    return;
}

####################################################################################################
# convertToDBLP()
#
# Converts the DOAJ document into a DBLP document ($DBLP_FILENAME) using the
# 'DBLP.xsl' stylesheet from the current directory.
####################################################################################################
sub convertToDBLP {
    print "\033[1mGenerating $DBLP_FILENAME file\033[0m\n";
    _transform_doaj(
        'DBLP.xsl', $DBLP_FILENAME,
        journal     => $JOURNAL_NAME,
        issn_online => $JOURNAL_ISSN_ONLINE,
        issn_print  => $JOURNAL_ISSN_PRINT,
        key         => $JOURNAL_KEY,
        publisher   => $JOURNAL_PUBLISHER,
        address     => $JOURNAL_ADDRESS,
        url         => $JOURNAL_URL,
    );
    print "\033[1mThe file has been generated\033[0m\n";
    return;
}

####################################################################################################
# convertToPBN()
#
# Converts the DOAJ document into a PBN document ($PBN_FILENAME) using the
# 'PBN.xsl' stylesheet from the current directory.
####################################################################################################
sub convertToPBN {
    print "\033[1mGenerating $PBN_FILENAME file\033[0m\n";
    _transform_doaj(
        'PBN.xsl', $PBN_FILENAME,
        issn_online => $JOURNAL_ISSN_ONLINE,
        issn_print  => $JOURNAL_ISSN_PRINT,
    );
    print "\033[1mThe file has been generated\033[0m\n";
    return;
}

####################################################################################################
# Main block
####################################################################################################
#http://sphaerula.com/legacy/Perl/Intermediate/podusage.html#manOption

# Automatically support --version (prints $main::VERSION).
Getopt::Long::Configure('auto_version');

my ( $man, $help, $doajLogin, $doajPass );
Getopt::Long::GetOptions(
    'help|?'      => \$help,
    'man'         => \$man,
    'doajLogin=s' => \$doajLogin,
    'doajPass=s'  => \$doajPass,
) or Pod::Usage::pod2usage(2);    # unknown / malformed option

Pod::Usage::pod2usage(1) if $help;                                   # Displaying help
Pod::Usage::pod2usage( -verbose => 2, -exitstatus => 0 ) if $man;    # Displaying manual

# OAI request parameters; the optional positional arguments are the date range.
my %oai_request_params = ( metadataPrefix => 'oai_dc' );
$oai_request_params{'from'}  = $ARGV[0] if defined $ARGV[0];    # start date
$oai_request_params{'until'} = $ARGV[1] if defined $ARGV[1];    # end date

# Convert records to DOAJ format.
convertToDOAJ( \%oai_request_params );

# Send the DOAJ document if both login and password were given.
if ( defined $doajLogin && defined $doajPass ) {
    sendDOAJ( $doajLogin, $doajPass );
}

# Convert the DOAJ document to DBLP format, if 'DBLP.xsl' is present.
if ( -e 'DBLP.xsl' ) {
    convertToDBLP();
}
else {
    print "\033[31mIf you want to convert DOAI document to DBLP format, you should place, in the current directory, file http://home.agh.edu.pl/~polak/skrypty/DBLP.xsl\033[0m\n";
}

# Convert the DOAJ document to PBN format, if 'PBN.xsl' is present.
if ( -e 'PBN.xsl' ) {
    convertToPBN();
}
else {
    print "\033[31mIf you want to convert DOAI document to PBN format, you should place, in the current directory, file http://home.agh.edu.pl/~polak/skrypty/PBN.xsl\033[0m\n";
}

__END__

=head1 NAME

pmh-convert.pl - converts an OAI-PMH document into the DOAJ format; optionally also into the DBLP or PBN.

=head1 SYNOPSIS

pmh-convert.pl [--help] [--man] [--version] [--doajLogin username] [--doajPass password] [startDate] [endDate]

 Examples:

   ./pmh-convert.pl
       # Generate, in the current directory, DOAJ file
       # for the entire period of time, i.e., for all (input) records.

   ./pmh-convert.pl 2013-01-01 2013-07-30
       # Generate, in the current directory, DOAJ file
       # for a given period of time.

   ./pmh-convert.pl --doajLogin 1234 --doajPass abcd 2013-01-01 2013-07-30
       # Generate, in the current directory, DOAJ file
       # for a given period of time, and send DOAJ file to the DOAJ site;
       # sign in using '1234' as a username and 'abcd' as a password.

=head1 DESCRIPTION

This script converts an OAI-PMH document into DOAJ, and optionally sends DOAJ output file to DOAJ repository.

The script can also convert the data into DBLP format or PBN format, but by default, the DBLP file / PBN file is not generated.

If you copy the file L<http://home.agh.edu.pl/~polak/skrypty/DBLP.xsl>, and put it in the current directory, then the script, during its operation, will also generate an output file in DBLP format.

If you copy the file L<http://home.agh.edu.pl/~polak/skrypty/PBN.xsl>, and put it in the current directory, then the script, during its operation, will also generate an output file in PBN format.

=head1 ARGUMENTS

pmh-convert.pl takes the following arguments:

=over 4

=item help

  --help

(Optional.) Displays the usage message.

=item man

  --man

(Optional.) Displays all documentation.

=item version

  --version

(Optional.) Displays the current version of the script.

=item doajLogin

  --doajLogin login

(Optional.) Specifies the login name used during authentication on DOAJ repository.

=item doajPass

  --doajPass password

(Optional.) Specifies the password used during authentication on DOAJ repository.

=item startDate

(Optional.) Specifies the start date for a period of time.

=over 8

=item date

Format ISO 8601, see http://www.w3.org/TR/NOTE-datetime

=back

=item endDate

(Optional.) Specifies the end date for a period of time.

=over 8

=item date

Format ISO 8601, see http://www.w3.org/TR/NOTE-datetime

=back

=back

=head1 AUTHOR

Stanislaw Polak, E<lt>polak@icsr.agh.edu.plE<gt>.

=head1 COPYRIGHT

This program is distributed under the Artistic License.

=head1 DATE

31-07-2013

=cut