Unable to find source-code formatter for language: python. Available languages are: actionscript, html, java, javascript, none, sql, xhtml, xml
<Code sample>
#!/usr/bin/python
# -*- coding: utf-8 -*-
# columnar.py
# Luigi Messina - luigi.messina@unimib.it
# Biblioteca di Ateneo dell'Universita degli Studi di Milano Bicocca
#
# Converts from ALEPH500 XML-XSL catalog report columnar format to CSV (Comma
# Separated Values) to allow opening big file size reports into a spreadsheet
# application (i.e. OpenOffice Calc/Microsoft Excel) more easily.
# Copyright (c) 2010 Universita degli Studi di Milano Bicocca
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import csv
import mmap
import re
import sys
from os.path import exists, splitext
# Opening XML tag searched for in the input report; each occurrence marks the
# start of one section block.
INIT_SECTION_MARKER = "<section-02>"
class OutputFileExistsError(Exception):
    """Raised when the requested output file already exists.

    Attributes:
        value: the message (or any other object) describing the conflict.
    """

    def __init__(self, value):
        # Forward the value to Exception so that ``args``, ``repr()`` and
        # pickling carry it as well; without this call ``args`` stays empty.
        super().__init__(value)
        self.value = value

    def __str__(self):
        # Kept for backward compatibility: the text form is the repr of the
        # stored value (so a string message is shown quoted).
        return repr(self.value)
def find_marker(filedesc, markdata, position):
    """Return the offset of the next occurrence of a marker.

    Args:
        filedesc: searchable buffer exposing ``find(sub, start)``; a
            memory-mapped file in this script, but ``bytes`` or ``str``
            objects work as well.
        markdata: marker to look for. A text marker is encoded to bytes
            automatically when ``filedesc`` is not a text string.
        position: offset at which the search starts.

    Returns:
        The offset of the first match found from ``position`` onwards, or
        -1 when the marker does not occur.
    """
    # On Python 3, mmap and bytes objects only accept bytes-like needles,
    # while the section marker used by this script is a text literal.
    if isinstance(markdata, str) and not isinstance(filedesc, str):
        markdata = markdata.encode("utf-8")
    return filedesc.find(markdata, position)
def process_section(section):
    """Extract the data cell values of one section block.

    Args:
        section: raw XML of a single section, given as text, as bytes (the
            form in which blocks are read from a memory-mapped file) or as
            an iterable of text fragments.

    Returns:
        A list with the text found between each ``<data-N>`` / ``</data-N>``
        tag pair (N being a single digit 1-9), in document order. Values
        that span several lines are not matched because ``.`` does not
        match newlines.
    """
    if isinstance(section, (bytes, bytearray)):
        # NOTE(review): assumes the report is UTF-8 encoded -- confirm.
        # Undecodable bytes are replaced instead of aborting the whole run.
        text = bytes(section).decode("utf-8", errors="replace")
    elif isinstance(section, str):
        text = section
    else:
        text = "".join(section)
    # Raw string: "/" needs no escaping, and "\/" in a normal string literal
    # is an invalid escape sequence.
    return re.findall(r"<data-[1-9]>(.*?)</data-[1-9]>", text)
def process_section_first(section):
    """Extract the column titles (field names) of a section block.

    Intended to be called only for the first section block, whose titles
    become the CSV header row.

    Args:
        section: raw XML of a single section, given as text, as bytes (the
            form in which blocks are read from a memory-mapped file) or as
            an iterable of text fragments.

    Returns:
        A list with the text found between each ``<title-N>`` /
        ``</title-N>`` tag pair (N being a single digit 1-9), in document
        order. Titles that span several lines are not matched because
        ``.`` does not match newlines.
    """
    if isinstance(section, (bytes, bytearray)):
        # NOTE(review): assumes the report is UTF-8 encoded -- confirm.
        # Undecodable bytes are replaced instead of aborting the whole run.
        text = bytes(section).decode("utf-8", errors="replace")
    elif isinstance(section, str):
        text = section
    else:
        text = "".join(section)
    # Raw string: "/" needs no escaping, and "\/" in a normal string literal
    # is an invalid escape sequence.
    return re.findall(r"<title-[1-9]>(.*?)</title-[1-9]>", text)
def read_blocks(infile):
    """Yield the raw content of every section found in ``infile``.

    Args:
        infile: memory-mapped report; any object offering ``find``,
            ``seek``, ``read`` and ``len()`` works.

    Yields:
        One block per section, spanning from a section start marker up to
        (but not including) the next marker, or up to the end of the data
        for the last section.
    """
    section_start = find_marker(infile, INIT_SECTION_MARKER, 0)
    while section_start != -1:
        section_end = find_marker(infile, INIT_SECTION_MARKER,
                                  section_start + 1)
        # The last section has no following marker: stop at the end of the
        # data explicitly rather than handing a negative length to read().
        stop = len(infile) if section_end == -1 else section_end
        infile.seek(section_start)
        yield infile.read(stop - section_start)
        # The end of this block is the start of the next one, so the marker
        # does not have to be searched for a second time.
        section_start = section_end
def create_output_file(filename):
    """Create a new CSV output file, refusing to overwrite an existing one.

    Args:
        filename: path of the file to create.

    Returns:
        A ``(writer, handle)`` tuple: a ``csv.writer`` using ``;`` as the
        delimiter and a bare line feed as line terminator, plus the open
        file object it writes to. Closing the handle is up to the caller.

    Raises:
        OutputFileExistsError: if ``filename`` already exists.
        IOError: if the file cannot be created for any other reason.
    """
    try:
        # Mode "x" creates the file atomically and fails when it is already
        # there, which removes the window between an exists() check and the
        # open() call. The csv module needs a text-mode file opened with
        # newline=""; with a binary handle writerow() raises TypeError on
        # Python 3.
        # NOTE(review): UTF-8 output is an assumption -- confirm the encoding
        # expected by the spreadsheet application.
        ofile = open(filename, 'x', newline='', encoding='utf-8')
    except FileExistsError:
        raise OutputFileExistsError(
            'Output file %s already exists. I\'m not going to overwrite it.' %
            filename) from None
    except IOError as err:
        raise IOError(
            "Unable to open output file %s for writing. Permission denied?" %
            filename) from err
    csvfile = csv.writer(ofile, delimiter=';', lineterminator='\n')
    return (csvfile, ofile)
def usage():
    """Print the command-line synopsis and terminate the program.

    Raises:
        SystemExit: always, with exit status 2 (command-line usage error).
    """
    print("Usage: ./columnar <input file> [output file]")
    print()
    # This script calls usage() only when the mandatory input file argument
    # is missing, so report failure to the calling shell instead of the
    # default status 0.
    sys.exit(2)
def ver():
    """Print the program banner: name and version, copyright, warranty note."""
    banner = (
        "ALEPH XML-XSL catalog-report-columnar to CSV converter version 1.0",
        "Copyright (c) 2010 Universita degli Studi di Milano Bicocca",
        "This program is distributed in the hope that it will be useful,",
        "but WITHOUT ANY WARRANTY; without even the implied warranty of",
        "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the",
        "GNU General Public License for more details.",
    )
    for text in banner:
        print(text)
    # A trailing blank line ends the banner.
    print()
# Data rows written to one output file before the next file is started.
# NOTE(review): presumably chosen to stay below the 65536-row sheet limit of
# older spreadsheet applications (header row included) -- confirm.
_MAX_ROWS_PER_FILE = 65534


def _write_sections(blocks, outputfilename):
    """Write the header row and one CSV row per section block.

    A new output file named ``<base>_<n>.csv`` is started whenever the
    current one already holds ``_MAX_ROWS_PER_FILE`` data rows and another
    row has to be written.

    Returns:
        ``(blockcount, extra_files)``: the number of blocks written and the
        names of the additional output files, in creation order.
    """
    (csvout, outputfile) = create_output_file(outputfilename)
    # splitext() removes only the final extension, so directory parts such
    # as "./" and dots inside the name are preserved.
    basename = splitext(outputfile.name)[0]
    extra_files = []
    blockcount = 0
    linecount = 0
    try:
        for block in blocks:
            if isinstance(block, (bytes, bytearray)):
                # NOTE(review): assumes the report is UTF-8 encoded --
                # confirm. Undecodable bytes are replaced, not fatal.
                block = bytes(block).decode("utf-8", errors="replace")
            if blockcount == 0:
                # Field names come from the first section only.
                csvout.writerow(process_section_first(block))
            elif linecount >= _MAX_ROWS_PER_FILE:
                # Rotate lazily, i.e. only when another row really follows,
                # so no empty trailing file is created when the number of
                # sections is an exact multiple of the limit.
                outputfile.close()
                next_name = "%s_%i.csv" % (basename, len(extra_files) + 1)
                (csvout, outputfile) = create_output_file(next_name)
                print("Splitting output file after 65.000 rows.\nAdditional output file %s" % \
                    next_name)
                extra_files.append(next_name)
                linecount = 0
            csvout.writerow(process_section(block))
            blockcount += 1
            linecount += 1
    finally:
        outputfile.close()
    return (blockcount, extra_files)


def main():
    """Convert the report named on the command line to one or more CSV files.

    Command line: ``<input file> [output file]``. Without an output file
    name, the input name with its extension replaced by ``.csv`` is used.
    The input is memory-mapped read-only, checked for the section marker,
    and each section becomes one CSV row, preceded by a header row taken
    from the first section.

    Exits with status 1 when the input cannot be opened or does not contain
    the section marker. Errors raised by ``create_output_file`` (for
    example when the output file already exists) propagate to the caller.
    """
    if len(sys.argv) < 2:
        usage()
        return
    input_name = sys.argv[1]
    try:
        # Read-only binary access is all the conversion needs, so write
        # permission on the report is not required.
        inputfile = open(input_name, 'rb')
    except IOError:
        print("Unable to open input file %s\nPlease check file name is correct." % \
            input_name)
        sys.exit(1)

    if len(sys.argv) > 2:
        outputfilename = '%s' % sys.argv[2]
    else:
        outputfilename = '%s.csv' % splitext(input_name)[0]
        print("No outputfile specified. Using %s" % outputfilename)

    # The mapped data is binary, so search it with a bytes marker.
    marker = INIT_SECTION_MARKER
    if isinstance(marker, str):
        marker = marker.encode("utf-8")

    with inputfile:
        ver()
        bad_format = 'Unable to find "%s" XML tag. Please verify %s is in ALEPH XML catalog columnar format' % \
            (INIT_SECTION_MARKER, inputfile.name)
        try:
            mmapped_file = mmap.mmap(inputfile.fileno(), 0,
                                     access=mmap.ACCESS_READ)
        except ValueError:
            # mmap refuses zero-length files; an empty report cannot
            # contain the marker either.
            print(bad_format)
            sys.exit(1)
        with mmapped_file:
            markerpos = find_marker(mmapped_file, marker, 0)
            if markerpos == -1:
                print(bad_format)
                sys.exit(1)
            # Count the sections by walking the markers; nothing is
            # allocated per match.
            sections_count = 0
            while markerpos != -1:
                sections_count += 1
                markerpos = find_marker(mmapped_file, marker,
                                        markerpos + len(marker))
            print("%s contains %i sections to be processed" % (inputfile.name,
                sections_count))
            print("Processing records, please wait...")
            # The output file is created only after the input has been
            # validated, so a rejected input leaves no empty file behind.
            (blockcount, extra_files) = _write_sections(
                read_blocks(mmapped_file), outputfilename)

    if not extra_files:
        print("%i section blocks read out of %i XML sections." % (blockcount,
            sections_count))
    else:
        print("%i section blocks read out of %i XML sections.\nRows after 65534 have been written to the following additional output files:" % \
            (blockcount, sections_count))
        for extra_name in extra_files:
            print(extra_name)
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
</Code sample>