· 6 years ago · Jul 24, 2019, 03:56 AM
1# -*- coding: utf-8 -*-
2
3
'''
Convert an edge list and a label list to zero-based integer IDs.
'''
7
8import os
9import sys
10#from snlocest.largedict import LargeDict
11
12
def parse_args(argv=None):
    """Parse the command-line arguments of the ID conversion script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads the process arguments, which is
            what the original zero-argument call did.

    Returns:
        An ``argparse.Namespace`` with the attributes ``mode`` (str),
        ``tablepath`` (str, required option) and ``inputfiles`` (list of
        one or more str).
    """
    import argparse

    parser = argparse.ArgumentParser()
    # The help text is Japanese for "processing mode".
    parser.add_argument('mode', help='処理のモード')
    parser.add_argument('--tablepath', required=True)
    parser.add_argument('inputfiles', nargs='+')
    return parser.parse_args(argv)
20
21
def load_table(filepath):
    """Load a user-ID conversion table from a tab-separated file.

    Every non-blank line must hold two tab-separated fields: the original
    user ID and its integer index.

    Args:
        filepath: Path of the table file to read.

    Returns:
        A ``(user2id, idcnt)`` tuple. ``user2id`` maps each user ID string
        to its integer index; ``idcnt`` is the largest index plus one, or
        ``0`` when the table contains no entries.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields, or its index field is not an integer.
    """
    user2id = {}
    with open(filepath, 'r') as fd:
        for line in fd:
            stripped = line.rstrip()
            if not stripped:
                # Tolerate blank lines (e.g. a stray newline at end of file)
                # instead of failing on the two-value unpack below.
                continue
            user_id, idx = stripped.split('\t')
            user2id[user_id] = int(idx)
    # default=-1 makes an empty table yield idcnt == 0 rather than raising
    # ValueError from max() on an empty sequence.
    idcnt = max(user2id.values(), default=-1) + 1
    return user2id, idcnt
30
31
# Number of leading columns that hold user IDs in each convertible file type.
_ID_COLUMNS = {'edgelist': 2, 'label': 1}


def _build_table(inputfiles):
    """Assign consecutive 0-based IDs to user IDs in first-seen order.

    The first two tab-separated columns of every line in every input file
    are treated as user IDs. A line with fewer than two columns raises
    IndexError.
    """
    user2id = {}
    for filepath in inputfiles:
        with open(filepath, 'r') as inputfile:
            for line in inputfile:
                tokens = line.rstrip().split('\t')
                for user_id in (tokens[0], tokens[1]):
                    # len(user2id) is always the next unused ID; setdefault
                    # leaves already-registered users untouched.
                    user2id.setdefault(user_id, len(user2id))
    return user2id


def _save_table(user2id, tablepath):
    """Write the table as one 'user_id<TAB>index' line per entry."""
    with open(tablepath, 'w') as fd:
        for user_id, idx in user2id.items():
            print(user_id, idx, sep='\t', file=fd)


def _convert(inputfiles, user2id, n_id_columns):
    """Print each input line to stdout with its leading ID columns converted.

    The first ``n_id_columns`` tab-separated fields are replaced by their
    table index; the remaining fields are passed through unchanged. An ID
    missing from the table raises KeyError and a line with too few columns
    raises IndexError.
    """
    for filepath in inputfiles:
        with open(filepath, 'r') as fd:
            for line in fd:
                row = line.rstrip().split('\t')
                converted = [user2id[row[i]] for i in range(n_id_columns)]
                print(*converted, *row[n_id_columns:], sep='\t')


def main():
    """Run the mode selected on the command line.

    Modes:
        table: build a conversion table from the input files and save it to
            ``--tablepath``; refuses to overwrite an existing path.
        edgelist: convert the first two columns of each input line.
        label: convert the first column of each input line.

    Exits with status 1 when the table path already exists or the mode is
    not one of the above.
    """
    args = parse_args()

    if args.mode == 'table':
        # Generate the conversion table from inputfiles and save to tablepath.
        if os.path.exists(args.tablepath):
            print('Table path already exists', file=sys.stderr)
            sys.exit(1)
        _save_table(_build_table(args.inputfiles), args.tablepath)
    elif args.mode in _ID_COLUMNS:
        # Convert an edge list or label file using the saved table.
        user2id, _ = load_table(args.tablepath)
        _convert(args.inputfiles, user2id, _ID_COLUMNS[args.mode])
    else:
        print('Invalid mode. Choose "table" or "edgelist" or "label"', file=sys.stderr)
        # Signal the failure to the caller; previously this path exited with 0.
        sys.exit(1)


if __name__ == '__main__':
    main()