FetchSitemapFiles
From Wiki of the E-Business and Web Science Research Group
Python script for the bulk download of RDF resources based on a Semantic Sitemap
Code
#!/usr/bin/env python
# encoding: utf-8
"""
FetchSitemapFiles.py

Bulk Download of RDF content using a Semantic Sitemap

This program downloads all data sources specified in the given semantic
sitemap and saves them as a set of numbered files (1.rdf, 2.rdf, ...) in a
given subdirectory.

Created by Martin Hepp on 2009-06-15.
This software is free software under the LGPL.

Acknowledgements: Some inspiration on using urllib came from Doug Hellmann,
http://www.doughellmann.com/PyMOTW/urllib/
"""

import contextlib
import os
import urllib
from xml.dom import minidom

# urlopen/urlretrieve live in urllib on Python 2 and in urllib.request on
# Python 3; bind whichever is available so the script runs on both.
try:
    import urllib.request as _urllib  # Python 3
except ImportError:
    _urllib = urllib  # Python 2

SITEMAP_URI = 'http://rdf4ecommerce.esolda.com/sitemap.xml'  # insert URI of your sitemap here
SEMSITEMAP_NS = 'http://sw.deri.org/2007/07/sitemapextension/scschema.xsd'
RDF_DIRECTORY = 'rdfdownloads/'


def _element_text(element):
    """Return the text content of <element> without surrounding whitespace."""
    text_types = (element.TEXT_NODE, element.CDATA_SECTION_NODE)
    parts = [node.data for node in element.childNodes
             if node.nodeType in text_types]
    return ''.join(parts).strip()


def get_listof_URIs(sitemapURI):
    """Extract a list of all data dump locations from the semantic sitemap at <sitemapURI>

    The sitemap is fetched and parsed as XML; every <dataDumpLocation>
    element in the semantic sitemap namespace contributes one entry, in
    document order. Elements without any text are skipped, and whitespace
    around a URI (as left by pretty-printed XML) is removed.

    Returns a list of strings. Errors raised while fetching or parsing the
    sitemap are not caught here.
    """
    # closing() releases the connection even if parsing fails.
    with contextlib.closing(_urllib.urlopen(sitemapURI)) as sitemap:
        dom = minidom.parse(sitemap)
    elements = dom.getElementsByTagNameNS(SEMSITEMAP_NS, 'dataDumpLocation')
    resources = []
    for location in elements:
        uri = _element_text(location)
        if uri:  # an empty <dataDumpLocation/> carries nothing to download
            resources.append(str(uri))
    return resources


def main(sitemap_uri=SITEMAP_URI, target_dir=RDF_DIRECTORY):
    """Download every data dump listed in the sitemap into <target_dir>.

    Files are stored as 1.rdf, 2.rdf, ... in the order in which the sitemap
    lists them. The target directory is created when it does not exist yet.
    Returns the number of downloaded resources. A failing download raises
    and aborts the run.
    """
    uri_list = get_listof_URIs(sitemap_uri)
    total = len(uri_list)
    # urlretrieve does not create missing directories, so do it up front.
    if uri_list and target_dir and not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    for counter, address in enumerate(uri_list, 1):
        # Single parenthesised argument: prints identically on Python 2 and 3.
        print("URI %d of %d: %s" % (counter, total, address))
        fname = os.path.join(target_dir, str(counter) + ".rdf")
        _urllib.urlretrieve(address, fname)
    return total


# Main
if __name__ == "__main__":
    main()