· 7 years ago · Feb 22, 2019, 10:46 PM
1use strict;
2use warnings;
3use utf8;
4use 5.012;
5
# Split an e-mail address into its local part and its domain.
#
# The split is made at the LAST "@" in the string, so a local part that
# itself contains "@" is kept intact.
#
# Args:
#   $email - the address to split; undef is treated as the empty string.
#
# Returns:
#   A two-element list (local_part, domain).  When the string contains no
#   "@" at all, the whole string is returned as the local part and the
#   domain is the empty string.
sub parse_email {
    my $email = shift // '';

    my $pos = rindex($email, '@');

    # rindex() yields -1 when there is no "@".  Passing that on to substr()
    # would chop the last character off the local part and report the whole
    # string as the domain, so handle the case explicitly.
    return ($email, '') if $pos < 0;

    return (substr($email, 0, $pos), substr($email, $pos + 1));
}
12
# Tell whether a domain is one of the Gmail domains.
#
# Args:
#   $domain - the domain part of an address (the text after "@").
#             Compared case-insensitively; undef is treated as ''.
#
# Returns:
#   1 when the domain is gmail.com or googlemail.com, undef otherwise.
sub is_gmail {
    my $domain = shift // '';

    # Built once and kept across calls.
    state $gmail_domain = {
        'gmail.com'      => 1,
        'googlemail.com' => 1,
    };

    # Domain names are case-insensitive, so fold before the lookup;
    # otherwise "GMAIL.COM" would not be recognised.
    return $gmail_domain->{ lc $domain };
}
21
# Reduce the local part of a Gmail address to a canonical form so that
# aliases of the same mailbox compare equal.
#
# Args:
#   $account - the local part of the address (the text before "@").
#
# Returns:
#   The lower-cased leading run of [a-z0-9.] characters with every dot
#   removed, or "" when there is no such run (or the run is just "0").
sub aggregated_gmail_account {
    my ($account) = @_;

    # Gmail does not distinguish upper case from lower case.
    my $lowered = lc($account);

    # Gmail's rule for account names is [a-z0-9.]{6,30}; anything after the
    # leading run of those characters (such as a "+tag" suffix) is dropped.
    # The length limit itself is not enforced here.
    if ($lowered =~ /^([a-z0-9\.]+)/) {
        my $base = $1;

        if ($base) {
            # Gmail treats addresses that differ only in dots as identical.
            $base =~ s/\.//g;
            return $base;
        }
    }

    return "";
}
37
# Tell whether a domain is the Yahoo! Japan mail domain.
#
# Args:
#   $domain - the domain part of an address (the text after "@").
#             Compared case-insensitively; undef is treated as ''.
#
# Returns:
#   1 when the domain is yahoo.co.jp, undef otherwise.
sub is_yahoomail {
    my $domain = shift // '';

    # Built once and kept across calls.
    state $yahoo_domain = {
        'yahoo.co.jp' => 1,
    };

    # Domain names are case-insensitive, so fold before the lookup;
    # otherwise "Yahoo.co.jp" would not be recognised.
    return $yahoo_domain->{ lc $domain };
}
45
# Reduce the local part of a Yahoo! Japan address to its base name.
#
# Yahoo lets a user configure "safety addresses" of the form
# <basename>-<freeword>.  A main account cannot be matched up with the
# basename of a safety address, but two or more safety addresses that
# share a basename can be caught.
#
# Args:
#   $account - the local part of the address (the text before "@").
#
# Returns:
#   The lower-cased leading run of [a-z0-9_] characters, or "" when there
#   is no such run (or the run is just "0").
sub aggregated_yahoo_account {
    my ($account) = @_;

    my $lowered = lc($account);
    my ($base) = ($lowered =~ /^([a-z0-9\_]+)/);

    return $base ? $base : "";
}
57
# Map an e-mail address to a canonical form so that aliases of the same
# mailbox collapse onto one key.
#
# Args:
#   $email - the full address.
#
# Returns:
#   For Gmail domains, the normalised local part followed by '@gmail.com'
#   (googlemail.com is folded into gmail.com).  For yahoo.co.jp, the base
#   name followed by '@yahoo.co.jp'.  For every other domain, and whenever
#   the local part cannot be normalised, the address is returned unchanged.
sub aggregation {
    my $email = shift;

    my ($account, $domain) = parse_email($email);

    if (is_gmail($domain)) {
        my $base = aggregated_gmail_account($account);

        # An empty base means the local part had no usable characters.
        # Do not fabricate the bogus address "@gmail.com" in that case.
        return $base . '@gmail.com' if length $base;
    }
    elsif (is_yahoomail($domain)) {
        my $base = aggregated_yahoo_account($account);

        # Same guard as above, for "@yahoo.co.jp".
        return $base . '@yahoo.co.jp' if length $base;
    }

    return $email;
}
73
# Main script.
#
# Reads the tab-separated file named by the first command-line argument
# (columns: person, email), normalises every Gmail / yahoo.co.jp address,
# and prints to STDOUT each normalised address that occurs more than once,
# followed by a tab and its count.  Progress and failures go to STDERR.
# Addresses on other domains are ignored.

my $count = {};    # normalised address => number of rows that map to it
my $lines = 0;     # number of input lines read so far

my $path = $ARGV[0];
die "usage: $0 <tsv-file>\n" unless defined $path;

open my $fh, "<", $path or die "cannot open $path: $!\n";
while (my $line = <$fh>) {
    chomp $line;
    ++$lines;
    print STDERR "done $lines\n" if $lines % 100000 == 1;

    my ($person, $email) = split "\t", $line;

    # A blank line or a row without a tab leaves one of the fields undef;
    # skip it instead of pushing undef through the helpers.
    unless (defined $person && defined $email) {
        print STDERR "skipped malformed line $lines\n";
        next;
    }
    next if $person eq 'person'; # header

    my ($account, $domain) = parse_email($email);

    my $aggregated;
    if (is_gmail($domain)) {
        # Named $base rather than $a, so sort's package variable $a is
        # not shadowed.
        if (my $base = aggregated_gmail_account($account)) {
            $aggregated = $base . '@gmail.com';
        }
        else {
            print STDERR "failed. $person, $email\n";
        }
    }
    elsif (is_yahoomail($domain)) {
        if (my $base = aggregated_yahoo_account($account)) {
            $aggregated = $base . '@yahoo.co.jp';
        }
        else {
            print STDERR "failed. $person, $email\n";
        }
    }
    next unless $aggregated;

    # ++ on a missing hash entry starts from 0 without a warning.
    $count->{$aggregated}++;
}
close $fh;

print STDERR "end read\n";

# Report only the addresses that were seen more than once.
for my $email (keys %$count) {
    my $c = $count->{$email};
    next if $c <= 1;
    printf "%s\t%d\n", $email, $c;
}