FetchSitemapFiles

Tool: FetchSitemapFiles

Python script for the bulk download of RDF resources based on a Semantic Sitemap

Code

#!/usr/bin/env python
# encoding: utf-8
"""
FetchSitemapFiles.py

Bulk Download of RDF content using a Semantic Sitemap

This program downloads all data sources specified in the given semantic sitemap
and saves them as a set of numbered files (1.rdf, 2.rdf, ...) in a given subdirectory.

Created by Martin Hepp on 2009-06-15.
This software is free software under the LGPL.

Acknowledgements: Some inspiration on using urllib came from Doug Hellmann,
http://www.doughellmann.com/PyMOTW/urllib/
"""

import os
import urllib
from xml.dom import minidom

SITEMAP_URI = 'http://rdf4ecommerce.esolda.com/sitemap.xml' # insert URI of your sitemap here
# XML namespace of the Semantic Sitemap extension; used to find <dataDumpLocation> elements.
SEMSITEMAP_NS = 'http://sw.deri.org/2007/07/sitemapextension/scschema.xsd'
# Target directory for the numbered .rdf files. NOTE(review): urlretrieve does
# not create directories — this must already exist when the script runs.
RDF_DIRECTORY = 'rdfdownloads/'

def get_listof_URIs(sitemapURI):
    """Extract a list of all data dump locations from the semantic sitemap at <sitemapURI>.

    Fetches the sitemap XML, parses it, and returns the text content of every
    <dataDumpLocation> element in the Semantic Sitemap namespace as a list of
    URI strings. Empty elements are skipped and surrounding whitespace is
    stripped, since either would otherwise yield an unusable download URL.
    """
    sitemap = urllib.urlopen(sitemapURI)
    try:
        dom = minidom.parse(sitemap)
    finally:
        # Close the network response even if parsing fails (the original
        # leaked the connection).
        sitemap.close()
    resources = []
    for location in dom.getElementsByTagNameNS(SEMSITEMAP_NS, 'dataDumpLocation'):
        # An empty <sc:dataDumpLocation/> has firstChild == None and would
        # raise AttributeError; skip it instead.
        if location.firstChild is not None:
            resources.append(str(location.firstChild.data).strip())
    return resources

# Main: download every data dump listed in the sitemap into RDF_DIRECTORY,
# naming the files 1.rdf, 2.rdf, ... in sitemap order.
uri_list = get_listof_URIs(SITEMAP_URI)
total = len(uri_list)
# urlretrieve does not create missing directories; make sure the target exists
# before the first download instead of failing with an IOError.
if not os.path.isdir(RDF_DIRECTORY):
    os.makedirs(RDF_DIRECTORY)
for counter, address in enumerate(uri_list, 1):
    # print(...) with a single pre-formatted argument works identically under
    # Python 2 (paren-wrapped statement) and Python 3 (function call).
    print("URI %d of %d: %s" % (counter, total, address))
    fname = os.path.join(RDF_DIRECTORY, "%d.rdf" % counter)
    urllib.urlretrieve(address, fname)