This is a sample XML file to be read by this script: Read Sample XML file

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#Python script that reads and converts .xml file into SPSS .sav file

# Reference Hetland p. 427 (Beginning Python From Novice to Professional)
# Other resource: Chap 24 of Python in a Nutshell (Alex Martelli)
# Raynald Levesque August 2008

from xml.sax.handler import ContentHandler
from xml.sax import parse
import spss

# Location of the input document -- edit these two values to suit your setup.
fpath = 'C:/Test2'      # folder that holds the XML file
fname = 'sample.xml'    # the enclosed sample data file

# Full path of the XML document handed to the SAX parser.
xmlFile = '/'.join([fpath, fname])

class DataHandler(ContentHandler):
    """SAX handler that flattens selected XML elements into tab separated text.

    Every start tag whose name is listed in ``wantList`` appends one string
    to ``extract``.  The string holds the element's attribute values, each
    followed by a tab, and ends with a newline.  Records start with a type
    code: 1 for ``action_log``, 2 for ``action`` and 3 for ``Deletedaction``;
    types 2 and 3 carry the id of the most recent ``action_log`` as their
    second field.

    The first string appended is prefixed with a line of ``action_log``
    variable names; the first ``action``/``Deletedaction`` string appended
    after that is prefixed with a line of action variable names.
    """

    in_extract = False

    def __init__(self, extract, wantList):
        """Set up the handler.

        extract  -- list that receives one string per wanted element.
        wantList -- names of the elements to extract.
        """
        ContentHandler.__init__(self)
        self.extract = extract
        self.data = []
        self.wantList = wantList
        # next 2 strings were copy/pasted from the XML document then 'cleaned'
        self.action_logAtt = (
            'id team1 team1_name team2 team2_name league league_id '
            'date matchday season season_code start1 start2'
        ).split()
        self.actionAtt = (
            'aid action_code activitytype result id minute second '
            'field_position receiver team_id x y z pace last_modified'
        ).split()
        # Deleted actions share the attribute list of regular actions
        self.DeletedactionAtt = self.actionAtt
        # next string was found by inspection of the XML file; these
        # attributes are not present on every action element
        self.actionOpt = 'subtype c1 c2 c3'.split()
        self.first_Action = True    # action header line not written yet
        self.first_Log = True       # log header line not written yet

    def startElement(self, name, attrs):
        """Append one tab separated record to ``extract`` for a wanted element.

        name  -- element name reported by the parser.
        attrs -- SAX attributes object of the element.

        Raises KeyError (from ``attrs.getValue``) when one of the attributes
        listed in ``action_logAtt`` / ``actionAtt`` is missing.
        """
        self.in_extract = name in self.wantList
        if not self.in_extract:
            return

        if name == 'action_log':
            self.data = ["1\t"]                             # record type 1
            self.data.extend(["%s\t" % attrs.getValue(att) for att in self.action_logAtt])
            # will be added at beg of each child record
            self.action_logID = attrs.getValue('id')

        elif name == 'action':
            self.data = ["2\t", self.action_logID + '\t']   # record type 2
            # Optional attributes become an empty field when absent.
            # Membership is tested with `in`: attribute objects have no
            # has_key() method in Python 3, while `in` works in 2 and 3.
            self.dataOpt = ['\t'] * len(self.actionOpt)
            for idx, attrib in enumerate(self.actionOpt):
                if attrib in attrs:
                    self.dataOpt[idx] = attrs.getValue(attrib) + '\t'
            self.data.extend(["%s\t" % attrs.getValue(att) for att in self.actionAtt])
            self.data.extend(self.dataOpt)

        elif name == 'Deletedaction':
            self.data = ["3\t", self.action_logID + '\t']   # record type 3
            self.data.extend(["%s\t" % attrs.getValue(att) for att in self.DeletedactionAtt])

        # Insert var names in tab delimited file
        if self.first_Log:
            # the first string appended to extract holds both the var names
            # and the attributes of the 1st XML element
            vnames = ['recType']
            vnames.extend(self.action_logAtt)
            vnames = ["%s\t" % v for v in vnames]
            vnames.append('\n')
            vnames.extend(self.data)
            self.data = vnames
            self.first_Log = False
        elif self.first_Action and name in ['action', 'Deletedaction']:
            # same idea for the first action record: var names, then values
            vnames = ['recType', 'logID']
            vnames.extend(self.actionAtt)
            vnames.extend(self.actionOpt)
            vnames = ["%s\t" % v for v in vnames]
            vnames.append('\n')
            vnames.extend(self.data)
            self.data = vnames
            self.first_Action = False      # we won't come back through this branch

        text = ''.join(self.data) + '\n'
        self.extract.append(text)

    def endElement(self, name):
        """Reset the per-element state when a wanted element closes."""
        if name in self.wantList:
            self.data = []
            self.in_extract = False

    def characters(self, string):
        """Collect character data while inside a wanted element.

        NOTE: for the three element types handled by startElement the record
        has already been appended to ``extract`` by the time this is called,
        so the text collected here is not part of that record.
        """
        if self.in_extract:
            self.data.append(string)


# Use the class to create the tab separated data file
# Note: If we were dealing with a very large file, it would be preferable to create
# the 2 text files within the DataHandler class

# Run the parser, then split the collected records over two tab separated
# text files: <nameroot>Log.txt and <nameroot>Action.txt.
extract = []
wantList = ['action_log', 'action', 'Deletedaction']    # Elements to extract
parse(xmlFile, DataHandler(extract, wantList))          # extract now contains the data

# partition() keeps what precedes the first '.', and -- unlike
# fname[:fname.find('.')] -- leaves a name without any '.' intact.
nameroot = fname.partition('.')[0]                      # sample.xml --> sample
fLogName = r'%(fpath)s/%(nameroot)sLog' % vars()        # --> path/sampleLog
fActionName = r'%(fpath)s/%(nameroot)sAction' % vars()  # --> path/sampleAction

# The SPSS syntax further down reads "<fLogName>.txt" and "<fActionName>.txt",
# so the files are created with that extension.  They are opened in binary
# mode because every record is encoded to bytes before it is written (this
# works under Python 2 and 3; lines end with '\n' on every platform).
fLog = open(fLogName + '.txt', 'wb')            # File to contain action_log info
try:
    fAction = open(fActionName + '.txt', 'wb')  # File to contain action info
    try:
        for s in extract:
            if not s:
                continue
            s2 = s.encode('iso8859-1')          # unicode must be encoded before writing
            # Route on content rather than on position in the list: a string
            # starts either with the record type digit or, when it carries a
            # header line, with 'recType' (the action header has 'logID' as
            # its second variable name, the log header does not).
            if s[0] in ('2', '3') or s.startswith('recType\tlogID\t'):
                fAction.write(s2)
            elif s[0] == '1' or s.startswith('recType\t'):
                fLog.write(s2)
    finally:
        fAction.close()
finally:
    fLog.close()

# The 2 tab delimited text files were then read using SPSS and the syntax was
# pasted below

# SPSS syntax that imports the two tab delimited text files and saves each of
# them as a .sav file.  The variable lists and formats below were pasted from
# SPSS after reading the sample files.  Only the two file name roots are
# substituted into the template.
cmd = r"""
    SET PRINTBACK=YES /MPRINT=YES.
    DATASET CLOSE ALL.
    GET DATA  /TYPE = TXT
     /FILE = "%(fLogName)s.txt"
     /DELCASE = LINE
     /DELIMITERS = "\t"
     /ARRANGEMENT = DELIMITED
     /FIRSTCASE = 2
     /IMPORTCASE = ALL
     /VARIABLES =
     recType F1.0
     id F6.0
     team1 F3.0
     team1_name A19
     team2 F3.0
     team2_name A16
     league A14
     league_id F1.0
     date A22
     matchday F2.0
     season A9
     season_code F2.0
     start1 A22
     start2 A22.
    CACHE.
    SAVE OUTFILE= "%(fLogName)s.sav".

    GET DATA  /TYPE = TXT
     /FILE = "%(fActionName)s.txt"
     /DELCASE = LINE
     /DELIMITERS = "\t"
     /ARRANGEMENT = DELIMITED
     /FIRSTCASE = 2
     /IMPORTCASE = ALL
     /VARIABLES =
     recType F1.0
     logID F6.0
     aid F7.0
     action_code A4
     activitytype F2.0
     result F1.0
     id F2.0
     minute F2.0
     second F2.0
     field_position F2.0
     receiver F5.0
     team_id F3.0
     x F5.3
     y F5.3
     z F5.3
     pace F5.3
     last_modified A22
     subtype F2.0
     c1 F1.0
     c2 F1.0
     c3 F1.0 .
    CACHE.
    SAVE OUTFILE="%(fActionName)s.sav".
""" % {'fLogName': fLogName, 'fActionName': fActionName}

# Hand the finished syntax to SPSS for execution.
spss.Submit(cmd)