This notebook is part of the Semantic Interoperability between Bibliographic Conceptual Models project, undertaken by the Ionian University Database & Information Systems Group. The project studies the semantic interoperability of selected models in the library and cultural heritage domains, with the goal of identifying good practices that facilitate the conversion of one model's instances into another's.
More specifically, it contains the necessary descriptions and code to transform BIBFRAME data to RDA. BIBFRAME 2.0 and RDA are two well-known models in the library domain. The BIBFRAME model was developed by the Library of Congress to convert MARC records to linked data, while Resource Description and Access (RDA) is the de facto standard used by libraries worldwide to prepare the data in legacy MARC records for linked data.
The mapping part transforms the BIBFRAME dataset to RDA by applying the XSLT stylesheet that contains the mapping rules.
import os
import time
from functools import reduce
import copy
import numpy as np
import rdflib
import rdflib.util
from rdflib import Literal, Namespace, RDF, URIRef
from rdflib.namespace import SKOS, RDF, RDFS, OWL, DCTERMS
from urllib.parse import urlparse
import lxml.etree as ET
path = "C:\\ms\\development\\wsp_xml\\bf22rda\\"
bf_filename = "gold_bf2_derivations_dbis_20190324_instances_OnlyFamiliesC_sz20200510.rdf"
#bf_filename = "gold_bf2_derivations_dbis_20190324_instances_OnlyFamilies.rdf"
#bf_filename_test = "gold_bf2_derivations_instances_test.rdf"
# used namespaces
bf_ns = Namespace('http://id.loc.gov/ontologies/bibframe/')
bflc_ns = Namespace('http://id.loc.gov/ontologies/bflc/')
dbis_ns = Namespace('http://dbis.ionio.gr/Resources/library#')
eu_ns = Namespace('http://eurovoc.europa.eu/schema#')
thes_ns = Namespace('http://purl.org/iso25964/skos-thes#')
# Extend this map to include all defined prefixes from all apps
prefixes = {
'rdfs': RDFS,
'skos': SKOS,
'dcterms': DCTERMS,
'bf': bf_ns,
'bflc': bflc_ns,
'dbis': dbis_ns
}
def uriref(prefixed_uri):
prefix, value = prefixed_uri.split(':', 1)
ns = prefixes[prefix]
return ns[value]
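A quick sanity check of the helper below; a minimal illustration, usable with any prefixed name from the map above.
# Illustrative: expand a prefixed name to a full URIRef via the prefixes map;
# prints http://id.loc.gov/ontologies/bibframe/hasExpression
print(uriref('bf:hasExpression'))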
bf_graph = rdflib.Graph()
print("Start loading Bibframe Ontology at: ", time.asctime( time.localtime(time.time()) ))
bf_graph.parse(path + bf_filename, format='xml')
print("End loading Bibframe Ontology at: ", time.asctime( time.localtime(time.time()) ))
print("Graph size (graph statements): ", len(bf_graph))
#list(bf_graph.objects(subj_ref, status_ref))
hasExpression_ref = uriref('bf:hasExpression')
translation_ref = uriref('bf:translation') # for test see entity dbis_0000000017
translationOf_ref = uriref('bf:translationOf') # for test see entity dbis_0000000270
#list(bf_graph.subject_objects(hasExpression_ref))[:10]
for s, o in list(bf_graph.subject_objects(hasExpression_ref))[:10]:
print("Subject: %s -- Object: %s" % (s, o))
# Note: materialize the query results into a list, otherwise the generator can be consumed only once (and the query must be run again)!
hasExpression_tuples_list = list(bf_graph.subject_objects(hasExpression_ref))
s_w_uris, t_w_uris = list(zip(*hasExpression_tuples_list))
starting_work_paths_uris = list(set(s_w_uris) - set(t_w_uris))
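The set difference above keeps only the roots of the bf:hasExpression paths, i.e. works that appear as subjects but never as objects. A minimal sketch with hypothetical labels A, B, C:
# Toy chain A -hasExpression-> B -hasExpression-> C gives the pairs [(A, B), (B, C)];
# subjects {A, B} minus objects {B, C} leaves {A}, the root of the path.
toy_pairs = [("A", "B"), ("B", "C")]
toy_subjects, toy_objects = zip(*toy_pairs)
print(set(toy_subjects) - set(toy_objects))  # {'A'}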
# Validate graph consistency for the bf:hasExpression and bf:expressionOf reciprocal relationships
expressionOf_ref = uriref('bf:expressionOf')
#expressionOf_tuples_list = list(bf_graph.subject_objects(expressionOf_ref))
#print(set(hasExpression_tuples_list) - set([(y, x) for x, y in expressionOf_tuples_list]))
print(set(hasExpression_tuples_list) - set([(y, x) for x, y in list(bf_graph.subject_objects(expressionOf_ref))]))
work_hasExpression_partitions = {}
for p in starting_work_paths_uris:
work_hasExpression_partitions[str(p)] = list(bf_graph.transitive_objects(p, hasExpression_ref))
# The set of unique elements across all hasExpression partitions
def elements_in_work_sets(work_partitions):
return set(reduce(lambda x, y: x + y, work_partitions.values()))
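# Illustrative: elements_in_work_sets({'w1': [1, 2], 'w2': [2, 3]}) returns {1, 2, 3}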
def stream_rdf_xml_work_sets(work_partitions):
id_prefix = "#dbis_W_hExpS_" # "#dbis_"
output = "<dbis:WorkSets>" +"\n"
for k, v in work_partitions.items():
# output += '<dbis:WorkSet rdf:about=\"' + str(k) + '">' +"\n"
output += '<dbis:WorkSet rdf:about=\"' + str(k).replace("#dbis_", id_prefix, 1) + '">' +"\n"
for i in v:
# output += "<dbis:Work>" + str(i) + "</dbis:Work>" +"\n"
output += "<dbis:hasMember rdf:about=\"" + str(i) + "\" />" +"\n"
output += "</dbis:WorkSet>" +"\n"
return output + "</dbis:WorkSets>"
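# Illustrative result for the hypothetical one-member partition
# {'http://example.org/res#dbis_w1': ['http://example.org/res#dbis_w2']}:
# <dbis:WorkSets>
# <dbis:WorkSet rdf:about="http://example.org/res#dbis_W_hExpS_w1">
# <dbis:hasMember rdf:about="http://example.org/res#dbis_w2" />
# </dbis:WorkSet>
# </dbis:WorkSets>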
Next cell for information only
# Run this cell for debug information; otherwise skip it.
# To do so, uncomment the proper statements
#print(elements_in_work_sets(work_hasExpression_partitions))
#print(stream_rdf_xml_work_sets(work_hasExpression_partitions))
#for k, v in work_hasExpression_partitions.items():
# print("RDA work ID: %s, \n %s" % (k, v))
# print("--------------")
translation_ref = uriref('bf:translation') # for test see entity dbis_0000000017
translationOf_ref = uriref('bf:translationOf') # for test see entity dbis_0000000270
# Note: materialize the query results into a list, otherwise the generator can be consumed only once (and the query must be run again)!
translation_tuples_list = list(bf_graph.subject_objects(translation_ref))
translationOf_tuples_list = list(bf_graph.subject_objects(translationOf_ref))
translations_with_reciprocols_set = set(translation_tuples_list).union(set([(y, x) for x, y in translationOf_tuples_list]))
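# Illustrative, with hypothetical works A, B, C: the bf:translation pairs {(A, B)} united
# with the swapped bf:translationOf pairs {(C, B)} give {(A, B), (B, C)}, i.e. every edge
# oriented from the original work to its translation, whichever property was asserted.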
translations_with_reciprocols_list = list(translations_with_reciprocols_set)
print(translations_with_reciprocols_list[0])
#print(len(translations_with_reciprocols_list))
#for x in translations_with_reciprocols_list:
# print("%s --- %s" % (str(x[0]), str(x[1])) )
# This version adds translation paths at all levels to the existing work sets!
# The approach uses Breadth First Search (BFS).
# The final set of translations not matched against the existing work sets needs to be
# further processed: first partitioned, and then used to expand the initial
# partitions
#
# Attention!
# Run proper cells to initialize work_hasExpression_partitions
#work_extended_partitions = {}
work_extended_partitions = copy.deepcopy(work_hasExpression_partitions)
# note: the message "list index out of range" occurs when the filter results in an empty list!
tmp_translations_set = translations_with_reciprocols_set
translations_set_prev_len = np.iinfo(np.int32).max
step = 0
while len(tmp_translations_set) and (translations_set_prev_len > len(tmp_translations_set)):
step += 1
print("==== Expansion %d" % step)
translations_set_prev_len = len(tmp_translations_set)
for k, v in work_extended_partitions.items():
v_set = set(v)
list_to_add = []
if(len(tmp_translations_set)):
for i in [0,1]:
filtered = {x for x in tmp_translations_set if x[i] in v_set}
if len(filtered):
list_to_add += list(list(zip(*list(filtered)))[1-i])
tmp_translations_set = tmp_translations_set - filtered
#print("list to add: %s" % str(list(set(list_to_add))))
#print("===")
# remove duplicates before adding the values to the key in the dictionary
#----work_extended_partitions[k] = (v, list(set(list_to_add)))
work_extended_partitions[k] = v + list(set(list_to_add))
#print("work_extended_partitions[k]: %s" % str(work_extended_partitions[k]))
#print("-----------------------")
print("Non matched translations in Work sets--- Previous step: %d, Current step: %d" % (translations_set_prev_len, len(tmp_translations_set)))
for s, t in list(tmp_translations_set):
print(str(s), " -- ", str(t))
#print("Non matched translations in Work sets: %d" % len(tmp_translations_set))
#for s, t in list(tmp_translations_set):
# print(str(s), " -- ", str(t))
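The expansion loop above, reduced to a toy example; a minimal sketch of the same fixed-point idea with hypothetical identifiers, not the production code:
# Partition {'p': ['A']} plus the pairs {('A', 'B'), ('C', 'B')} needs two passes:
# the first pass absorbs B via A, the second absorbs C via the newly added B.
toy_partitions = {'p': ['A']}
toy_pairs = {('A', 'B'), ('C', 'B')}
prev_len = len(toy_pairs) + 1
while toy_pairs and prev_len > len(toy_pairs):
    prev_len = len(toy_pairs)
    for k, v in toy_partitions.items():
        v_set = set(v)
        matched = {x for x in toy_pairs if x[0] in v_set or x[1] in v_set}
        toy_partitions[k] = v + [e for x in sorted(matched) for e in x if e not in v_set]
        toy_pairs -= matched
print(toy_partitions)  # {'p': ['A', 'B', 'C']}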
# Before adding the remaining non-matched translations to the work sets, partition them
# according to their translation paths,
# e.g. [(7, 8), (10, 11), (9, 10)] ==> {'7': [7, 8], '9': [9, 10, 11]}
# Note that each key is one value of a pair, selected arbitrarily!
# Note also that this code template alone is not enough to accomplish the process
# correctly; it must run after the code of the previous cell!
# Some test cases; comment and uncomment as needed
#tmp_translations_lst = [
#(rdflib.term.URIRef('http://dbis.ionio.gr/Resources/library#dbis_0000001108'), rdflib.term.URIRef('http://dbis.ionio.gr/Resources/library#dbis_0000001109')),
#(rdflib.term.URIRef('http://dbis.ionio.gr/Resources/library#dbis_0000000887'), rdflib.term.URIRef('http://dbis.ionio.gr/Resources/library#dbis_0000000902')),
#(rdflib.term.URIRef('http://dbis.ionio.gr/Resources/library#dbis_0000001033'), rdflib.term.URIRef('http://dbis.ionio.gr/Resources/library#dbis_0000001033')),
#(rdflib.term.URIRef('http://dbis.ionio.gr/Resources/library#dbis_0000000902'), rdflib.term.URIRef('http://dbis.ionio.gr/Resources/library#dbis_0000000894')),
#(rdflib.term.URIRef('http://dbis.ionio.gr/Resources/library#dbis_0000001823'), rdflib.term.URIRef('http://dbis.ionio.gr/Resources/library#dbis_0000001818')) ]
work_translation_partitions = {}
tmp_translations_lst = list(tmp_translations_set)
# ATTENTION!
# Be careful with lists of URIs! rdflib's URIRef is a subclass of str, so passing a
# single URIRef to the list constructor converts it into a list of characters!
# This is why translation_stmnt_lst and statements like translation_stmnt_lst.pop(0)
# are used instead of rebuilding a list from the remaining element.
while tmp_translations_lst:
is_in_list = False
for k, v in work_translation_partitions.items():
#print("Key: %s, \nValue: %s" % (k, v))
v_set = set(v)
translation_stmnt_lst = list(tmp_translations_lst[0])
if tmp_translations_lst[0][0] in v_set:
translation_stmnt_lst.pop(0)
work_translation_partitions[k] = v + translation_stmnt_lst
is_in_list = True
elif tmp_translations_lst[0][1] in v_set:
translation_stmnt_lst.pop(1)
work_translation_partitions[k] = v + translation_stmnt_lst
is_in_list = True
if (len(set(tmp_translations_lst[0])) > 1) and not is_in_list:
# subject and object must not be the same...
#print("Not in lists: %s" % str(tmp_translations_lst[0][0]))
work_translation_partitions[str(tmp_translations_lst[0][0])] = list(tmp_translations_lst[0])
#print(str(work_translation_partitions))
tmp_translations_lst.pop(0)
# print(work_partitions)
for k, v in work_translation_partitions.items():
print("RDA work ID: %s, \n %s" % (k, v))
print("--------------")
# Finally, merge the work_translation_partitions dictionary into work_extended_partitions.
# Note that update() overwrites existing keys, but with the above process the keys of the
# second dictionary do not exist in the first!
work_extended_partitions.update(work_translation_partitions)
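# Note: the double reduce below assumes the OLD value structure (a pair of lists per key);
# with the current flat lists use elements_in_work_sets instead, as done further down.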
def elements_in_work_extended_sets(work_extended_partitions):
return set(reduce(lambda x, y: x+y, reduce(lambda x, y: x+y, work_extended_partitions.values())))
# Do not use this version! It is compatible ONLY when the dictionary value
# is a pair of lists, where the first list contains the hasExpression members
# and the second the translations
def stream_rdf_xml_work_extended_sets_Old(work_extended_partitions):
id_prefix = "#dbis_W_hExpS_" # "#dbis_"
output = "<dbis:WorkSets>" +"\n"
for k, v in work_extended_partitions.items():
output += '<dbis:WorkSet rdf:about=\"' + str(k).replace("#dbis_", id_prefix, 1) + '">' +"\n"
for i in [0,1]:
for xs in v[i]:
output += "<dbis:hasMember rdf:resource=\"" + str(xs) + "\"" + " dbis:inclusionProperty=\"" + str(i) + "\"" + " />" +"\n"
output += "</dbis:WorkSet>" +"\n"
return output + "</dbis:WorkSets>"
def stream_rdf_xml_work_extended_sets(work_extended_partitions):
id_prefix = "#dbis_W_hExpS_" # "#dbis_"
output = "<dbis:WorkSets>" +"\n"
for k, v in work_extended_partitions.items():
output += '<dbis:WorkSet rdf:about=\"' + str(k).replace("#dbis_", id_prefix, 1) + '">' +"\n"
for xs in v:
output += "<dbis:hasMember rdf:resource=\"" + str(xs) + "\"" + " dbis:inclusionProperty=\"" + str(0) + "\"" + " />" +"\n"
output += "</dbis:WorkSet>" +"\n"
return output + "</dbis:WorkSets>"
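# Illustrative output line for a hypothetical member (cf. the stream_rdf_xml_work_sets example):
# <dbis:hasMember rdf:resource="http://example.org/res#dbis_w2" dbis:inclusionProperty="0" />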
Next cell for information only
# Run this cell for debug information; otherwise skip it.
# To do so, uncomment the proper statements
# print(len(elements_in_work_extended_sets(work_extended_partitions)))
#print(elements_in_work_extended_sets(work_extended_partitions))
#print(elements_in_work_sets(work_extended_partitions))
# print(stream_rdf_xml_work_extended_sets(work_extended_partitions))
#for k in work_extended_partitions.keys():
# print(work_extended_partitions[k])
# print("----------------------")
Add code for otherEdition expansion
#bf:otherEdition bi-directional
otherEdition_ref = uriref('bf:otherEdition') # for test see entity dbis_0000000844
# Note: materialize the query results into a list, otherwise the generator can be consumed only once (and the query must be run again)!
otherEdition_tuples_list = list(bf_graph.subject_objects(otherEdition_ref))
print(otherEdition_tuples_list[0])
# This version adds otherEdition paths at all levels to the existing work sets!
# The approach uses Breadth First Search (BFS).
# The final set of otherEdition pairs not matched against the existing work sets needs to be
# further processed: first partitioned, and then used to expand the initial
# partitions
#
# Attention!
# Run proper cells to initialize work_extended_partitions
work_extended_2ndP_partitions = copy.deepcopy(work_extended_partitions)
# note: the message "list index out of range" occurs when the filter results in an empty list!
tmp_otherEditions_set = set(otherEdition_tuples_list)
otherEditions_set_prev_len = np.iinfo(np.int32).max
step = 0
while len(tmp_otherEditions_set) and (otherEditions_set_prev_len > len(tmp_otherEditions_set)):
step += 1
print("==== Expansion %d" % step)
otherEditions_set_prev_len = len(tmp_otherEditions_set)
for k, v in work_extended_2ndP_partitions.items():
v_set = set(v)
list_to_add = []
if(len(tmp_otherEditions_set)):
for i in [0,1]:
filtered = {x for x in tmp_otherEditions_set if x[i] in v_set}
if len(filtered):
list_to_add += list(list(zip(*list(filtered)))[1-i])
tmp_otherEditions_set = tmp_otherEditions_set - filtered
#print("list to add: %s" % str(list(set(list_to_add))))
#print("===")
# remove duplicates before adding the values to the key in the dictionary
#----work_extended_partitions[k] = (v, list(set(list_to_add)))
work_extended_2ndP_partitions[k] = v + list(set(list_to_add))
#print("work_extended_partitions[k]: %s" % str(work_extended_partitions[k]))
#print("-----------------------")
print("Non matched otherEditions in Work sets--- Previous step: %d, Current step: %d" % (otherEditions_set_prev_len, len(tmp_otherEditions_set)))
for s, t in list(tmp_otherEditions_set):
print(str(s), " -- ", str(t))
work_otherEdition_partitions = {}
tmp_otherEditions_lst = list(tmp_otherEditions_set)
# ATTENTION!
# Be careful with lists of URIs! rdflib's URIRef is a subclass of str, so passing a
# single URIRef to the list constructor converts it into a list of characters!
# This is why otherEdition_stmnt_lst and statements like otherEdition_stmnt_lst.pop(0)
# are used instead of rebuilding a list from the remaining element.
while tmp_otherEditions_lst:
is_in_list = False
for k, v in work_otherEdition_partitions.items():
#print("Key: %s, \nValue: %s" % (k, v))
v_set = set(v)
otherEdition_stmnt_lst = list(tmp_otherEditions_lst[0])
if tmp_otherEditions_lst[0][0] in v_set:
otherEdition_stmnt_lst.pop(0)
work_otherEdition_partitions[k] = v + otherEdition_stmnt_lst
is_in_list = True
elif tmp_otherEditions_lst[0][1] in v_set:
otherEdition_stmnt_lst.pop(1)
work_otherEdition_partitions[k] = v + otherEdition_stmnt_lst
is_in_list = True
if (len(set(tmp_otherEditions_lst[0])) > 1) and not is_in_list:
# subject and object must not be the same...
#print("Not in lists: %s" % str(tmp_otherEditions_lst[0][0]))
work_otherEdition_partitions[str(tmp_otherEditions_lst[0][0])] = list(tmp_otherEditions_lst[0])
#print(str(work_otherEdition_partitions))
tmp_otherEditions_lst.pop(0)
# print(work_partitions)
for k, v in work_otherEdition_partitions.items():
print("RDA work ID: %s, \n %s" % (k, v))
print("--------------")
# Finally, merge the work_otherEdition_partitions dictionary into work_extended_2ndP_partitions.
# Note that update() overwrites existing keys, but with the above process the keys of the
# second dictionary do not exist in the first!
work_extended_2ndP_partitions.update(work_otherEdition_partitions)
Skip next cell! It is informative for debugging purposes
# The next statements provide information related to the partition subsets.
# They are informative for debugging but not necessary for the data transformation.
work_c_ref = uriref('bf:Work')
text_c_ref = uriref('bf:Text')
cartography_c_ref = uriref('bf:Cartography')
audio_c_ref = uriref('bf:Audio')
notatedMusic_c_ref = uriref('bf:NotatedMusic')
notatedMovement_c_ref = uriref('bf:NotatedMovement')
dataset_c_ref = uriref('bf:Dataset')
stillImage_c_ref = uriref('bf:StillImage')
movingImage_c_ref = uriref('bf:MovingImage')
object_c_ref = uriref('bf:Object')
multimedia_c_ref = uriref('bf:Multimedia')
mixedMaterial_c_ref = uriref('bf:MixedMaterial')
work_and_subclasses_ref_list = [work_c_ref, text_c_ref, cartography_c_ref, audio_c_ref, notatedMusic_c_ref,
notatedMovement_c_ref, dataset_c_ref, stillImage_c_ref, movingImage_c_ref,
object_c_ref, multimedia_c_ref, mixedMaterial_c_ref]
all_works_with_subclasses_in_graph = list(reduce(lambda x, y: x+y,
[list(bf_graph.subjects(RDF.type, x_ref)) for x_ref in work_and_subclasses_ref_list]))
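# i.e. query the subjects of rdf:type for bf:Work and each of its subclasses,
# then concatenate the resulting lists into one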
print("Total Number of bf:Works in graph: %d \n" % len(all_works_with_subclasses_in_graph))
#print(all_works_with_subclasses_in_graph[0:10])
print("Number of partition subsets with more than ONE bf:Works: %d" % len(work_extended_2ndP_partitions))
elements_in_2ndP_partitions = list(map(lambda x: str(x), list(elements_in_work_sets(work_extended_2ndP_partitions))))
print("Total Number of bf:Works in the above subsets: %d \n" % len(elements_in_2ndP_partitions))
#print(elements_in_2ndP_partitions[0:10])
#all_works_with_subclasses_in_graph = list(bf_graph.subjects(RDF.type, audio_c_ref))
#print(len(all_works_with_subclasses_in_graph))
#not_in_work_subsets = set(all_works_with_subclasses_in_graph).intersection(set(elements_in_2ndP_partitions))
not_in_work_subsets = set(map(lambda x: str(x), all_works_with_subclasses_in_graph)) - set(elements_in_2ndP_partitions)
print("Number of trivial partition subsets (i.e. subsets containing just one bf:Work): %d" % len(not_in_work_subsets))
print("--------- bf:Works in trivial partition subsets")
for xs in not_in_work_subsets:
print(xs)
bf2rda_xsl_filename = "BF22RDA_mapping_WExS_xml_v3.xsl"
out_xml_filename = "bf2rda_Mapping_derivations_dbis_20200510_Instances_OnlyFamiliesC_sz20200510.rdf"
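# Re-parse the source RDF/XML with lxml so it can be fed to the XSLT processor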
bf_dom = ET.parse(path + bf_filename)
bf2rda_xslt = ET.parse(path + bf2rda_xsl_filename)
nameSpaces_str = "<rdf:RDF xmlns:dbis=\"http://dbis.ionio.gr/Resources/library/\">"  # apparently unused; note the trailing slash differs from the dbis_ns declared above
root_element_str = '<rdf:RDF xmlns="http://dbis.ionio.gr/Datasets/GDrvBF2_v1.0.0#" \
xml:base="http://dbis.ionio.gr/Datasets/GDrvBF2_v1.0.0" \
xmlns:bf="http://id.loc.gov/ontologies/bibframe/" \
xmlns:ns="http://www.w3.org/2003/06/sw-vocab-status/ns#" \
xmlns:owl="http://www.w3.org/2002/07/owl#" \
xmlns:dbis="http://dbis.ionio.gr/Resources/library#" \
xmlns:xsd="http://www.w3.org/2001/XMLSchema#" \
xmlns:skos="http://www.w3.org/2004/02/skos/core#" \
xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" \
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" \
xmlns:terms="http://purl.org/dc/terms/" \
xmlns:bflc="http://id.loc.gov/ontologies/bflc/" \
xmlns:xml="http://www.w3.org/XML/1998/namespace" \
xmlns:wot="http://xmlns.com/wot/0.1/" \
xmlns:foaf="http://xmlns.com/foaf/0.1/" \
xmlns:dc="http://purl.org/dc/elements/1.1/">'
#work_extended_sets_ele = ET.fromstring(root_element_str + stream_rdf_xml_work_extended_sets(work_extended_partitions) + "</rdf:RDF>")
work_extended_sets_ele = ET.fromstring(root_element_str + stream_rdf_xml_work_extended_sets(work_extended_2ndP_partitions) + "</rdf:RDF>")
work_extended_sets_dom = ET.ElementTree(work_extended_sets_ele)
#print(ET.tostring(work_extended_sets_dom, xml_declaration=False, encoding='unicode'))
#elements_to_change_type = list(map(lambda x: str(x), list(elements_in_work_extended_sets(work_extended_partitions))))
# Use this version, see the comment in function definitions for the modification in work_extended_partitions
#elements_to_change_type = list(map(lambda x: str(x), list(elements_in_work_sets(work_extended_partitions))))
elements_to_change_type = list(map(lambda x: str(x), list(elements_in_work_sets(work_extended_2ndP_partitions))))
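# Retag every owl:NamedIndividual whose rdf:about belongs to a work set as dbis:WorkSetMember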
for e in bf_dom.findall("/{http://www.w3.org/2002/07/owl#}NamedIndividual"):
if e.get("{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about") in elements_to_change_type:
e.tag="{http://dbis.ionio.gr/Resources/library#}WorkSetMember"
#bf_dom.write(path + "test_inserted_xml_code_20200509.rdf")
print(len(elements_to_change_type))
for x in elements_to_change_type:
print(str(x))
#USE THIS!!! It appends the <dbis:WorkSets> element (the work sets) as a child of the RDF root
bf_dom.getroot().extend(work_extended_sets_dom.getroot())
#bf_dom.write(path + "test_insert_code_allProcessed_20200509.rdf", xml_declaration=True, encoding='UTF-8')
print("Start transforming Bibframe Ontology to RDA at: ", time.asctime( time.localtime(time.time()) ))
transform = ET.XSLT(bf2rda_xslt)
rda_dom = transform(bf_dom)
print("End transforming Bibframe Ontology to RDA at: ", time.asctime( time.localtime(time.time()) ))
print("Tranfrormation done!")
Write the RDA-mapped RDF/XML ontology
# next is OK, but the output may contain character escapes such as &lt;
rda_dom.write_output(path + out_xml_filename)
print ("Transformation saved... check file in path!")