This code sample checks python ParseTuple or BuildValue format strings, and reports anything it isn't sure is a proper match. Tweak the FILTER_* options, and values in ''fmtfunctions'' to taste. This really should be done with a proper C parser, but I was too far along before that became completely obvious.
Usage: ./fmtcheck.py <dir1> <dir2> ...
#! /usr/bin/env python
#
# A lame python format string checker...
# Copyright (C) 2005 Michael Urman
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
class NonLiteral(ValueError): pass
FILTER_UNSIGNED = False
FILTER_UNKNOWN = False
fmtfunctions = [
"PyArg_ParseTuple",
#"Py_BuildValue"
]
mapfmt = {
's': 'char',
'z': 'char',
'u': 'Py_UNICODE',
'e': 'char',
'#': 'int',
'b': 'char',
'B': 'unsigned char',
'h': 'short',
'H': 'unsigned short',
'i': 'int',
'I': 'unsigned int',
'l': 'long',
'L': 'PY_LONG_LONG',
'k': 'unsigned long',
'K': 'unsigned PY_LONG_LONG',
'c': 'char',
'f': 'float',
'd': 'double',
'D': 'Py_complex',
'O': 'PyObject',
'!': 'PyObject',
'&': '', # can't check void*
'N': 'PyObject',
'S': 'PyObject',
'U': 'PyObject',
't': 'char',
'w': 'char',
}
maptype = {
'const char': 'char',
'long int': 'long',
'unsigned long int': 'unsigned long',
'short int': 'short',
'unsigned short int': 'unsigned short',
'PyTypeObject': 'PyObject',
'PyUnicodeObject': 'PyObject',
'PyDateTime_DateType': 'PyObject',
'PyDateTime_TimeType': 'PyObject',
'PyDateTime_TZInfoType': 'PyObject',
'statichere PyTypeObject': 'PyObject',
'staticforward PyTypeObject': 'PyObject',
'PyCodeObject': 'PyObject',
'alcobject': 'PyObject',
'DBObject': 'PyObject',
'DBLockObject': 'PyObject',
'PyCursesWindowObject': 'PyObject',
'#define status_i': 'int'
}
knownobj = dict.fromkeys("""
PyCode_Type
PyDict_Type
PyString_Type
PyUnicode_Type
PyList_Type
PyTuple_Type
PyFile_Type
PyInt_Type
PySocketModule.Sock_Type
PyCursesWindow_Type
self
""".strip().split(), 'PyObject')
knownobj.update(dict(
Py_FileSystemDefaultEncoding='char',
))
def findfmt(line):
for func in fmtfunctions:
i = line.find(func)
if i >= 0: return i
return i
def checkfile(name):
from os.path import basename
lines = [line.strip() for line in open(name)]
kill_comments(lines)
important = [i for (i, line) in enumerate(lines) if findfmt(line)>=0]
for i in important:
fmt = ''
try: fmt, variables = getfmtvars(lines, i)
except NonLiteral: continue
except Exception, err: print 'skipping %s:%d %s' % (name, i+1, err)
if not fmt: continue
types = getvartypes(lines, i, variables)
fakefmt = fmt
for c in '()[]|': fakefmt = fakefmt.replace(c, '')
if ':' in fakefmt: fakefmt = fakefmt[:fakefmt.find(':')]
if ';' in fakefmt: fakefmt = fakefmt[:fakefmt.find(';')]
fakefmt = fakefmt.replace('O&', '&&') # can't check function or void*
#print i, zip(types, fakefmt, variables)
for t, f, v in zip(types, fakefmt, variables):
checkfmt = mapfmt.get(f)
checktype = maptype.get(t, t)
if FILTER_UNKNOWN and checktype == 'unknown': continue
if FILTER_UNSIGNED:
if checkfmt.startswith('unsigned '): checkfmt = checkfmt[9:]
if checktype.startswith('unsigned '): checktype = checktype[9:]
if checkfmt != '' and checkfmt != checktype:
print "%-20s%40s >< %s" % (
"%s:%d" % (basename(name), i+1),
" `%s' type `%s'" % (v.strip('&'), t),
"%s: %s" % (f, checkfmt))
def kill_comments(lines):
in_comment = False
for i, line in enumerate(lines):
line.replace('\\','')
if in_comment:
be = line.find('*/')
if be >= 0:
line = line[be+2:]
in_comment = False
else:
line = ''
lines[i] = line
while True:
bs = line.find('/*')
be = line.find('*/')
ls = line.find('//')
if bs >= 0:
if ls >= 0 and ls < bs:
line = line[:ls]
elif be >= 0:
line = line[:bs] + line[be+2:]
else:
line = line[:bs]
in_comment = True
elif ls >= 0:
line = line[:ls]
else:
break
lines[i] = line
def getfmtvars(lines, i):
content = lines[i][findfmt(lines[i]):]
left = content.find('(')
right = left + 1
count = 1
nextline = i+1
while count > 0:
if len(content) == right:
content += ' ' + lines[nextline]
nextline += 1
if content[right] == '(': count += 1
elif content[right] == ')': count -= 1
right += 1
content = content[:right]
args = map(str.strip, content[left+1:right-1].split(','))
if args[0].startswith('"'):
fmt = args[0]
args = args[1:]
elif args[1].startswith('"'):
fmt = args[1]
args = args[2:]
elif len(args) > 2 and args[2].startswith('"'):
fmt = args[2]
args = args[4:]
else:
raise NonLiteral("unrecognized format string")
if ' ' in fmt and fmt.find('#') > fmt.find(' '): fmt = fmt[:fmt.find(' ')] + '"'
while not fmt.endswith('"'):
fmt += "," + args.pop(0)
fmt = fmt[1:-1]
if ':' in fmt: fmt = fmt[:fmt.find(':')]
if ';' in fmt: fmt = fmt[:fmt.find(';')]
return fmt, args
def getvartypes(lines, i, variables):
types = {}
typelist = []
for var in variables:
var = var.strip()
if var.startswith('('):
types[var] = var[1:var.find(')')].strip('*').strip()
elif var.startswith('"'):
types[var] = 'char'
elif var.strip('&') in knownobj:
types[var] = knownobj[var.strip('&')]
elif '.' in var or '->' in var:
types[var] = 'unknown'
elif var.startswith('ntohs(') or var.startswith('htons('):
types[var] = 'short'
elif var.startswith('ntohl(') or var.startswith('htonl('):
types[var] = 'long'
else:
if var.startswith('&'): var = var[1:]
if '[' in var: var = var[:var.find('[')]
j = 1
for j in range(1,i+1):
line = lines[i-j]
if '[]' not in line and var in line:
fakeline = ' ' + line + ' '
for c in '()[],;+-*/=': fakeline = fakeline.replace(c, ' ')
if (' ' + var + ' ') in fakeline:
if '(' in line and '=' not in line:
line = line[line.find('('):]
for c in '=+-/^&!~()[]': line = line.replace(c, ',')
pieces = [piece.strip() for piece in line.split(",")]
decl = pieces[0].split()
if len(decl) > 1:
maybe = " ".join(decl[:-1]).strip('*')
if maybe != 'return' and '"' not in maybe:
types[var] = maybe
break
else:
types[var] = 'unknown'
typelist.append(types.get(var, 'unknown'))
if typelist[-1].startswith('static '): typelist[-1] = types[var][7:]
return typelist
if __name__ == '__main__':
import sys, os
for basedir in sys.argv[1:]:
for dirpath, dirnames, filenames in os.walk(basedir):
for name in filenames:
if name.endswith('.c') or name.endswith('.h'):
checkfile(os.path.join(dirpath, name))