FetchSitemapFiles

From Wiki of the E-Business and Web Science Research Group
Jump to: navigation, search

Python script for the bulk download of RDF resources based on a Semantic Sitemap

Code

 
 
#!/usr/bin/env python
# encoding: utf-8
"""
FetchSitemapFiles.py
 
Bulk Download of RDF content using a Semantic Sitemap
 
This program downloads all data sources specified in the given semantic sitemap
and saves them as a set of numbered files (1.rdf, 2.rdf, ...) in a given subdirectory.
 
Created by Martin Hepp on 2009-06-15.
This software is free software under the LGPL.
 
Acknowledgements: Some inspiration on using urllib came from Doug Hellmann,
http://www.doughellmann.com/PyMOTW/urllib/
"""
 
import urllib
from xml.dom import minidom
 
# Location of the semantic sitemap that the script fetches and parses.
SITEMAP_URI = 'http://rdf4ecommerce.esolda.com/sitemap.xml' # insert URI of your sitemap here
# XML namespace used to look up the <dataDumpLocation> elements in the sitemap.
SEMSITEMAP_NS = 'http://sw.deri.org/2007/07/sitemapextension/scschema.xsd'
# Directory (written with a trailing slash) that receives the numbered .rdf files.
RDF_DIRECTORY = 'rdfdownloads/'
 
def get_listof_URIs(sitemapURI):
	"""Extract all data dump locations from the semantic sitemap at <sitemapURI>.

	Arguments:
	sitemapURI -- URI of the semantic sitemap XML document to fetch and parse.

	Returns a list of strings, one per non-empty <dataDumpLocation> element in
	the SEMSITEMAP_NS namespace, in document order and with surrounding
	whitespace removed.

	Errors raised while opening the URI or parsing the XML are propagated to
	the caller.
	"""
	# urlopen lives in urllib.request on Python 3 and directly in urllib on
	# Python 2; importing it here keeps the function usable on both.
	try:
		from urllib.request import urlopen
	except ImportError:
		from urllib import urlopen

	sitemap = urlopen(sitemapURI)
	try:
		dom = minidom.parse(sitemap)
	finally:
		# Release the connection even when parsing fails.
		sitemap.close()

	resources = []
	for location in dom.getElementsByTagNameNS(SEMSITEMAP_NS, 'dataDumpLocation'):
		# Join all text/CDATA children instead of reading firstChild only, so
		# an empty element (no children at all) cannot raise AttributeError.
		text = ''.join(
			child.data for child in location.childNodes
			if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
		)
		# Pretty-printed sitemaps surround the URI with whitespace/newlines.
		text = text.strip()
		if text:
			resources.append(str(text))
	return resources
 
# Main
# Fetch the sitemap, then download every listed data dump into RDF_DIRECTORY
# as 1.rdf, 2.rdf, ... in sitemap order.
import os

# urlretrieve lives in urllib.request on Python 3 and directly in urllib on
# Python 2.
try:
	from urllib.request import urlretrieve
except ImportError:
	from urllib import urlretrieve

# urlretrieve() does not create missing directories, so make sure the target
# directory exists before the first download.
if RDF_DIRECTORY and not os.path.isdir(RDF_DIRECTORY):
	os.makedirs(RDF_DIRECTORY)

uri_list = get_listof_URIs(SITEMAP_URI)
total = len(uri_list)
for counter, address in enumerate(uri_list, 1):
	# Single parenthesised argument: prints identically on Python 2 and 3.
	print("URI %d of %d: %s" % (counter, total, address))
	fname = os.path.join(RDF_DIRECTORY, "%d.rdf" % counter)
	urlretrieve(address, fname)
 
 
Personal tools
Navigation