1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
* Extract drink volumes from free-text strings and convert them to millilitres.
* Uses regular expressions in an embedded Python program.

* Sample data.
DATA LIST LIST /drink_vol (A33).
BEGIN DATA
'50 ml'
'Large Wine Glass (250 ml'
'500ml',
'Standard Wine Glass (175 ml',
'ml',
'Standard Wine Glass (175 ml',
'1 Litre',
'187ml mini wine bottle',
'Pint',
'1.5 Litre'
'Half Pint'
'Half-pint'
END DATA.

* Extract the volume from each string and store it, in ml, in a new variable.
BEGIN PROGRAM Python.
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import spssdata, re

# Open the active dataset with a write cursor that reads only drink_vol, and
# append a new numeric variable 'vol' that will receive the extracted volume.
data = spssdata.Spssdata(indexes=('drink_vol'), accessType='w')
data.append(spssdata.vdef('vol', vlabel='Volume, ml', vfmt=['F', 8, 0]))
data.commitdict()

# Search patterns paired with the multiplier that converts the matched quantity
# to ml. They are compiled once here instead of being rebuilt for every case.
# 'number' is one or more digits with at most ONE decimal point ('\.?' rather
# than '\.*'), so text such as '1..5' can never be captured and handed to
# float(), which would raise ValueError.
_VOLUME_PATTERNS = [
    (re.compile(r'(?P<number>\d+\.?\d*) *ml'), 1),         # e.g. '500ml', '175 ml'
    (re.compile(r'(?P<number>\d+\.?\d*) *litre'), 1000),   # e.g. '1.5 Litre'
    # Here you may add some more patterns...
]

# Volume, in ml, assigned to strings that mention a pint.
_PINT_ML = 568.26


def _volume_in_ml(text):
    """Return the volume described by *text* in millilitres.

    The text is lower-cased and checked against each entry of
    _VOLUME_PATTERNS in order; the first match wins. If no pattern matches,
    a string containing 'pint' yields a pint (or half of it when the string
    also contains 'half'). Returns None when nothing is recognised.
    """
    lowered = text.lower()
    for regex, multiplier in _VOLUME_PATTERNS:
        found = regex.search(lowered)
        if found:
            return float(found.group('number')) * multiplier
    # Reached only when no numeric pattern matched, so a legitimately parsed
    # value of 0 is never mistaken for "not found".
    if "pint" in lowered:
        if "half" in lowered:
            return _PINT_ML / 2
        return _PINT_ML
    return None


try:
    for row in data:
        # None is written when no volume could be extracted from the string.
        data.casevalues([_volume_in_ml(row.drink_vol)])
finally:
    # Always close the cursor, even if a case raised, so it is not left open.
    data.CClose()
53
END PROGRAM.