Ticket #6266: deanonymind.py

File deanonymind.py, 3.6 KB (added by karsten, 7 years ago)
Line 
1#!/usr/bin/env python
2import os
3import sys
4import zipfile
5
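"""
Take a MaxMind GeoIP database as input and replace each A1 entry with the
country code and name of the entry preceding it, but only if all of the
following hold: the A1 entry is a single entry (not part of a run of
consecutive A1 entries), the preceding entry ends directly before the A1
entry starts, the subsequent entry starts directly after the A1 entry
ends, and the preceding and subsequent entries contain the same country
code information.  A1 entries that do not meet these conditions are left
unchanged.

Usage:
  python deanonymind.py [GeoIPCountryCSV.zip] [NewGeoIPCountryWhois.csv]
"""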
def main():
    """Run the whole conversion: read arguments, rewrite A1 entries, save.

    The input and output paths come from parse_args(); the rewritten lines
    produced by process_input_file() are handed to write_output_file().
    """
    source_path, target_path = parse_args()
    write_output_file(target_path, process_input_file(source_path))
20
def parse_args():
    """Validate the command line and return the input and output paths.

    Exactly two arguments are expected after the script name: the path of
    an existing ``.zip`` file to read and the path of the CSV file to
    write.  If the argument count is wrong, or the input path is not an
    existing regular file whose name ends in ``.zip``, a message is printed
    and the process exits with status 1.

    Returns:
        A ``(input_file, output_file)`` tuple taken from ``sys.argv``.
    """
    if len(sys.argv) != 3:
        print('Usage: python %s [GeoIPCountryCSV.zip] '
              '[NewGeoIPCountryWhois.csv]' % (sys.argv[0], ))
        sys.exit(1)
    input_file = sys.argv[1]
    # isfile() rather than exists(): a directory whose name happens to end
    # in ".zip" is not a readable archive and is rejected here as well.
    if not os.path.isfile(input_file) or not input_file.endswith('.zip'):
        # A parenthesised single-argument print produces the same output
        # as a Python 2 print statement and as the Python 3 print function.
        print('Input file "%s" does not exist or is not a .zip file.' % (
                input_file, ))
        sys.exit(1)
    output_file = sys.argv[2]
    return (input_file, output_file)
33
def process_input_file(input_file):
    """Read the GeoIP CSV from a zip archive and rewrite its A1 entries.

    The archive member ``GeoIPCountryWhois.csv`` is split into lines.
    Consecutive lines containing ``"A1"`` are collected into a run and
    handed to process_a1_lines() together with the non-A1 line before the
    run and the non-A1 line after it (``None`` when the run reaches the end
    of the file).  All other lines are passed through unchanged.

    Args:
        input_file: Path of the zip archive to read.

    Returns:
        The list of output lines, without line terminators.
    """
    archive = zipfile.ZipFile(input_file)
    try:
        csv_content = archive.read('GeoIPCountryWhois.csv')
    finally:
        # Release the archive handle even if the member is missing.
        archive.close()
    if not isinstance(csv_content, str):
        # Python 3 returns bytes here; the splitting below needs text.
        # NOTE(review): latin-1 is used because it maps every byte to a
        # character, so decoding cannot fail -- confirm the actual encoding
        # of the CSV if non-ASCII country names matter.
        csv_content = csv_content.decode('latin-1')
    result_lines = []
    prev_line = None
    a1_lines = []
    for line in csv_content.split('\n'):
        if '"A1"' in line:
            a1_lines.append(line)
            continue
        if a1_lines:
            # The run of A1 lines ends at this line; resolve it first so
            # that output order matches input order.
            result_lines.extend(process_a1_lines(prev_line, a1_lines, line))
            a1_lines = []
        result_lines.append(line)
        prev_line = line
    if a1_lines:
        # The file ended while still inside a run of A1 lines.
        result_lines.extend(process_a1_lines(prev_line, a1_lines, None))
    return result_lines
56
def process_a1_lines(prev_line, a1_lines, next_line):
    """Try to reassign a run of A1 lines to the country of its neighbours.

    The run is rewritten only when it consists of exactly one line, both
    neighbouring lines are present, the previous entry's end number plus
    one equals the A1 entry's start number, the A1 entry's end number plus
    one equals the next entry's start number, and both neighbours carry
    the same country code.  In every other case the run is returned
    unchanged.

    Args:
        prev_line: The non-A1 line preceding the run, or a false value if
            there is none.
        a1_lines: The list of consecutive A1 lines.
        next_line: The non-A1 line following the run, or a false value if
            there is none.

    Returns:
        A list of lines to emit in place of the run.
    """
    # Parenthesised single-argument prints produce the same output under
    # the Python 2 print statement and the Python 3 print function.
    if not prev_line or not next_line:
        # Can't merge first or last line in file.
        print("Can't merge first or last line in file.")
        return a1_lines
    if len(a1_lines) > 1:
        # Can't merge more than 1 line at once.
        print("Can't merge more than 1 line at once.")
        return a1_lines
    a1_line = a1_lines[0].strip()
    prev_entry = parse_line(prev_line)
    a1_entry = parse_line(a1_line)
    next_entry = parse_line(next_line)
    # "Touching" means the numeric ranges are adjacent with no gap.
    touches_prev_entry = (int(prev_entry['end_num']) + 1 ==
                          int(a1_entry['start_num']))
    touches_next_entry = (int(a1_entry['end_num']) + 1 ==
                          int(next_entry['start_num']))
    same_country_code = (prev_entry['country_code'] ==
                         next_entry['country_code'])
    if touches_prev_entry and touches_next_entry and same_country_code:
        return format_line_with_other_country(a1_entry, prev_entry)
    return a1_lines
80
def parse_line(line):
    """Split one CSV line into a dict of its six named fields.

    All double quotes are removed, surrounding whitespace is stripped, and
    the remainder is split on commas into ``start_str``, ``end_str``,
    ``start_num``, ``end_num``, ``country_code`` and ``country_name``.
    All values are kept as strings.

    Args:
        line: One line of the CSV file, or a false value.

    Returns:
        ``None`` if ``line`` is empty or ``None``; otherwise a dict mapping
        field names to values.  Lines with fewer than six fields yield a
        dict containing only the leading keys.
    """
    if not line:
        return None
    keys = ['start_str', 'end_str', 'start_num', 'end_num',
            'country_code', 'country_name']
    stripped_line = line.replace('"', '').strip()
    # Split on at most len(keys) - 1 commas so that a comma inside the
    # final field (e.g. "Korea, Republic of") stays part of the country
    # name instead of truncating it to "Korea".
    parts = stripped_line.split(',', len(keys) - 1)
    return dict(zip(keys, parts))
90
def format_line_with_other_country(a1_entry, other_entry):
    """Build a CSV line using one entry's range and another's country.

    The four address fields (``start_str``, ``end_str``, ``start_num``,
    ``end_num``) are taken from ``a1_entry``; ``country_code`` and
    ``country_name`` are taken from ``other_entry``.

    Returns:
        A one-element list holding the quoted, comma-separated line.
    """
    range_fields = [a1_entry[key] for key in
                    ('start_str', 'end_str', 'start_num', 'end_num')]
    country_fields = [other_entry[key] for key in
                      ('country_code', 'country_name')]
    values = tuple(range_fields + country_fields)
    return ['"%s","%s","%s","%s","%s","%s"' % values]
96
def write_output_file(output_file, lines):
    """Write the given lines to a file, separated by newlines.

    The file is opened in text write mode, so an existing file at that
    path is overwritten.  No newline is added after the last line.

    Args:
        output_file: Path of the file to write.
        lines: List of strings to write, without line terminators.
    """
    # The with-block closes the handle even if the write raises.
    with open(output_file, 'w') as out_file:
        out_file.write('\n'.join(lines))
101
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
104