#!/usr/bin/env python
# encoding: utf-8
# Python script for the bulk download of RDF resources based on a Semantic Sitemap
"""
FetchSitemapFiles.py
Bulk Download of RDF content using a Semantic Sitemap
This program downloads all data sources specified in the given semantic sitemap
and saves it as a set of numbered files (1.rdf, 2.rdf,...) in a given subdirectory.
Created by Martin Hepp on 2009-06-15.
This software is free software under the LGPL.
Acknowledgements: Some inspiration on using urllib came from Doug Hellmann,
http://www.doughellmann.com/PyMOTW/urllib/
"""
import urllib
from xml.dom import minidom
# URI of the Semantic Sitemap to process -- insert URI of your sitemap here.
SITEMAP_URI = 'http://rdf4ecommerce.esolda.com/sitemap.xml'
# XML namespace of the Semantic Sitemap extension (used to find
# <dataDumpLocation> elements in the sitemap document).
SEMSITEMAP_NS = 'http://sw.deri.org/2007/07/sitemapextension/scschema.xsd'
# Local target directory for the downloaded dumps (1.rdf, 2.rdf, ...).
# NOTE(review): the directory must already exist; trailing slash is required
# because filenames are built by plain string concatenation.
RDF_DIRECTORY = 'rdfdownloads/'
def get_listof_URIs(sitemapURI):
    """Extract a list of all data dump locations from the semantic sitemap at <sitemapURI>.

    Fetches the sitemap XML document via HTTP, parses it, and collects the
    text content of every <dataDumpLocation> element in the Semantic Sitemap
    extension namespace (SEMSITEMAP_NS).

    Returns a list of dump URIs as plain strings (may be empty if the
    sitemap contains no <dataDumpLocation> elements).
    """
    sitemap = urllib.urlopen(sitemapURI)
    dom = minidom.parse(sitemap)
    elements = dom.getElementsByTagNameNS(SEMSITEMAP_NS, 'dataDumpLocation')
    # firstChild.data is the element's text node, i.e. the dump URI itself.
    return [str(location.firstChild.data) for location in elements]
# Main routine: fetch every data dump listed in the sitemap and save each one
# as a sequentially numbered file (1.rdf, 2.rdf, ...) inside RDF_DIRECTORY.
uri_list = get_listof_URIs(SITEMAP_URI)
total = len(uri_list)
# enumerate(..., 1) yields a 1-based counter, matching the 1.rdf naming scheme.
for counter, address in enumerate(uri_list, 1):
    print("URI %d of %d: %s" % (counter, total, address))
    fname = RDF_DIRECTORY + str(counter) + ".rdf"
    urllib.urlretrieve(address, fname)