· 7 years ago · Nov 15, 2018, 03:44 AM
1#!/bin/bash
2
3################################################################################
4#
5# Generates data for the 'TripWatch' campaign
6#
7# @author cmundhe
8# @since 2014-03-15
9#
10################################################################################
11
12# ==============================================================================
13# Setup
14# ==============================================================================
15
16# WARN: If the $LIVE flag is passed as true (which should only be the case for
17# live sends of the campaign) AND the send fails in the Java subscription sender
18# phase, DO NOT RE-RUN this script! For every live data gen, a_trending_history
19# gets populated with trending hotels to suppress for the next month. If live data
20# is re-generated after a failure, there's a risk that legitimate trending
21# properties will get accidentally suppressed.
22
23set -e
24
25usage="Usage: hive_2_et_tripwatch.bash <WHTOP> <CLUSTER> <TRTOP> [SEND_DATE] [LIVE=0|1] [WAIT_MINUTES]"
26
27if [ $# -lt 3 ]; then
28 echo -e ${usage}
29 exit 1
30fi
31
32WHTOP=$1
33CLUSTER=$2
34TRTOP=$3
35SEND_DATE=$4
36if [ -z "$SEND_DATE" ]; then
37 SEND_DATE=$(date +%F)
38fi
39
40LIVE=$5
41
42WAIT_MINUTES=$6
43if [ -z "$WAIT_MINUTES" ]; then
44 WAIT_MINUTES=30
45fi
46
47. $WHTOP/common/config/env.bash $WHTOP/clusters/$CLUSTER
48. $WHTOP/scripts/crm/common/utils.bash
49. $WHTOP/scripts/crm/common/constants.bash
50
51
52# ==============================================================================
53# Constants
54# ==============================================================================
55
56EMAIL_NAME_ID=13043
57TODAY=$(date -d 'today' +%F)
58WEEK_OF_YEAR=$(date -d 'today' +%W)
59ONE_WEEK_AGO=$(date -d '1 week ago' +%F)
60ACTIVE_SUBSCRIPTION_CUTOFF_DATE="2013-11-01"
61NEW_SUBSCRIPTION_CUTOFF_DATE=$ONE_WEEK_AGO
62TMP_PREFIX=${WORKING_DB_NAME}.tmp_tripwatch
63
64CREATIVE_TYPE_OVERVIEW="overview"
65CREATIVE_TYPE_TRENDING="trending"
66CREATIVE_TYPE_TOP_LIST="top_list"
67
68# ------------------------------------------------------------------------------
69# Per-geo content
70# ------------------------------------------------------------------------------
71
72# Shared
73TBL_WATCHED_LOCATIONS_BASE=${TMP_PREFIX}_watched_locations_base
74TBL_WATCHED_LOCATIONS=${TMP_PREFIX}_watched_locations
75TBL_ELIGIBLE_GEOS=${TMP_PREFIX}_eligible_geos
76TBL_ELIGIBLE_PROPERTIES=${TMP_PREFIX}_eligible_properties
77TBL_ELIGIBLE_PROPERTY_REVIEWS=${TMP_PREFIX}_eligible_property_reviews
78TBL_ELIGIBLE_PROPERTY_REVIEW_COUNTS=${TMP_PREFIX}_eligible_property_review_counts
79TBL_UNIQUE_VIEWS=${TMP_PREFIX}_unique_views
80TBL_MEMBER_PROPERTY_VIEWS=${TMP_PREFIX}_member_property_views
81TBL_UNIQUE_PROPERTY_VIEWS=${TMP_PREFIX}_unique_property_views
82
83# Geo details
84TBL_GEO_DETAILS=${TMP_PREFIX}_geo_details
85
86# Top lists
87TBL_ELIGIBLE_RESTAURANT_FEATURES=${TMP_PREFIX}_eligible_restaurant_features
88TBL_ELIGIBLE_ATTRACTION_CATEGORIES=${TMP_PREFIX}_eligible_attraction_categories
89TBL_TOP_LISTS_AGGREGATE=${TMP_PREFIX}_top_lists_aggregate
90TBL_TOP_LISTS=${TMP_PREFIX}_top_lists
91
92# Trending
93TBL_TRENDING=${TMP_PREFIX}_trending
94
95# ------------------------------------------------------------------------------
96# Per-member-and-geo content
97# ------------------------------------------------------------------------------
98
99# Send list
100TBL_RECIPIENTS=${TMP_PREFIX}_recipients
101TBL_RECIPIENT_USERNAMES=${TMP_PREFIX}_recipient_usernames
102TBL_PRELIMINARY_SEND_LIST=${TMP_PREFIX}_preliminary_send_list
103TBL_SEND_LIST=${TMP_PREFIX}_send_list
104TBL_TEST_SEND_LIST=${TMP_PREFIX}_test_send_list
105
106# Featured hotels
107TBL_RECENTLY_VIEWED_HOTELS=${TMP_PREFIX}_recently_viewed_hotels
108TBL_RECENTLY_VIEWED_HOTELS_WITH_NEW_REVIEW=${TMP_PREFIX}_recently_viewed_hotels_with_new_review
109TBL_RECENTLY_VIEWED_HOTELS_WITHOUT_NEW_REVIEW=${TMP_PREFIX}_recently_viewed_hotels_without_new_review
110TBL_RECOMMENDED_HOTELS=${TMP_PREFIX}_recommended_hotels
111TBL_POPULAR_HOTELS_WITH_NEW_REVIEWS=${TMP_PREFIX}_popular_hotels_with_new_reviews
112TBL_HOTELS_WITH_NEW_REVIEWS=${TMP_PREFIX}_hotels_with_new_reviews
113TBL_FEATURED_HOTELS_AGGREGATE=${TMP_PREFIX}_featured_hotels_aggregate
114TBL_FEATURED_HOTELS=${TMP_PREFIX}_featured_hotels
115
116# Friend activity
117TBL_RECENT_ACTIVITY=${TMP_PREFIX}_recent_activity
118TBL_FRIEND_ACTIVITY=${TMP_PREFIX}_friend_activity
119
120# Airfare
121TBL_AIRFARE=${TMP_PREFIX}_airfare
122
123# Special offers
124TBL_SPECIAL_OFFERS=${TMP_PREFIX}_special_offers
125
126# Featured rentals
127TBL_RENTALS=${TMP_PREFIX}_rentals
128TBL_FEATURED_RENTALS=${TMP_PREFIX}_featured_rentals
129
130
131# ==============================================================================
132# Functions
133# ==============================================================================
134
135function verify_dependencies
136{
137 local PREV_DAY_TABLES=(
138 f_ech_daily
139 a_blessed_sessions_with_lookback
140 )
141
142 local SAME_DAY_TABLES=(
143 a_geo_coupons
144 a_member_origin
145 a_recent_activity
146 a_ta_friendmap
147 crm_locale_maps
148 crm_recommended_hotels_exploded
149 d_email_type
150 # f_airwatch_raw_data
151 # fb_global_details
152 t_airportcodes
153 t_attraction
154 t_currency
155 t_eatery_data
156 t_eateryfeatures
157 t_external_member
158 t_location
159 t_locationpaths
160 t_locationpopularity
161 t_locationratings
162 t_nearestairports
163 t_subscriptions
164 t_userreview
165 t_vacationrentaldata
166 t_watched_location
167 # vw_nonremovedlocation
168 vw_t_member
169 )
170
171 verify_standard_dependencies $(echo ${PREV_DAY_TABLES[@]} | tr ' ' ',') $(echo ${SAME_DAY_TABLES[@]} | tr ' ' ',') $TODAY $WAIT_MINUTES
172}
173
174function create_watched_locations
175{
176 local CREATIVE_TYPE
177 if [ $(( $(date +%V) % 2 )) = 0 ]; then
178 CREATIVE_TYPE="${CREATIVE_TYPE_TOP_LIST}"
179 else
180 CREATIVE_TYPE="${CREATIVE_TYPE_TRENDING}"
181 fi
182
183 # Eliminate TripWatch subscriptions for invalid geos as well as if expired
184 hive_command "CREATE TABLE IF NOT EXISTS $TBL_WATCHED_LOCATIONS_BASE
185 (
186 member_id INT,
187 geo_id INT,
188 creative_type STRING
189 )
190 COMMENT 'watched locations'"
191
192 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_WATCHED_LOCATIONS_BASE
193
194 SELECT
195 a.memberid AS member_id,
196 a.locationid AS geo_id,
197 -- Show 'overview' creative in the first week of a new subscription
198 if(a.creationdate >= '$NEW_SUBSCRIPTION_CUTOFF_DATE', '${CREATIVE_TYPE_OVERVIEW}', '${CREATIVE_TYPE}') AS creative_type
199
200 FROM t_watched_location a
201
202 -- Filter out Countries, Nations, Narrow Virtual Regions, Broad Virtual Regions, and Narrow Virtual Region Leaves
203 JOIN vw_nonremovedlocation b
204 ON a.locationid = b.id
205 AND NOT b.placetypeid IN (10001, 10002, 10032, 10033, 10034)
206
207 -- Filter by subscription date range and type
208 WHERE a.creationdate >= '$ACTIVE_SUBSCRIPTION_CUTOFF_DATE'
209 AND (a.expirationdate IS NULL OR a.expirationdate = '' OR '$SEND_DATE' < a.expirationdate)
210 AND a.subscriptiontypeid & 31 != 0"
211
212
213 # Members might have subscriptions to e.g. a city as well as the state that
214 # city is in. Ensure that only the most specific subscriptions e.g. the city
215 # is retained.
216 hive_command "CREATE TABLE IF NOT EXISTS $TBL_WATCHED_LOCATIONS LIKE $TBL_WATCHED_LOCATIONS_BASE"
217
218 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_WATCHED_LOCATIONS
219 SELECT a.*
220 FROM $TBL_WATCHED_LOCATIONS_BASE a
221 LEFT OUTER JOIN (
222 SELECT DISTINCT
223 a.member_id,
224 c.geo_id
225 FROM $TBL_WATCHED_LOCATIONS_BASE a
226 JOIN t_locationpaths b
227 ON a.geo_id = b.locationid
228 AND b.pathtype = 1
229 AND b.rank < b.maxrank
230 JOIN $TBL_WATCHED_LOCATIONS_BASE c
231 ON a.member_id = c.member_id
232 AND b.parentid = c.geo_id
233 ) b
234 ON a.member_id = b.member_id
235 AND a.geo_id = b.geo_id
236 WHERE b.member_id IS NULL"
237}
238
239# Normalize all watched locations to their nearest city ancestor, since many
240# non-geos or non-cities are in $TBL_WATCHED_LOCATIONS.
241# NOTE: This unfortunately excludes weirdly categorized geos such as Tokyo from
242# the TripWatch campaign. Might make sense to allow certain types of geos other
243# than cities.
244function create_eligible_geos
245{
246 hive_command "CREATE TABLE IF NOT EXISTS $TBL_ELIGIBLE_GEOS
247 (
248 geo_id INT,
249 geo_name STRING
250 )
251 COMMENT 'eligible geos'"
252
253 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_ELIGIBLE_GEOS
254 SELECT
255 a.geo_id,
256 b.primaryname AS geo_name
257 FROM (
258 SELECT DISTINCT geo_id
259 FROM $TBL_WATCHED_LOCATIONS
260 ) a
261 JOIN vw_nonremovedlocation b
262 ON a.geo_id = b.id"
263}
264
265function create_eligible_properties
266{
267 hive_command "CREATE TABLE IF NOT EXISTS $TBL_ELIGIBLE_PROPERTIES
268 (
269 geo_id INT,
270 property_id INT,
271 property_type_id INT
272 )
273 COMMENT 'eligible properties'"
274
275 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_ELIGIBLE_PROPERTIES
276 SELECT
277 a.geo_id,
278 c.id AS property_id,
279 c.placetypeid AS property_type_id
280 FROM $TBL_ELIGIBLE_GEOS a
281 JOIN t_locationpaths b
282 ON a.geo_id = b.parentid
283 AND b.pathtype = 1
284 AND b.rank < b.maxrank
285 JOIN vw_nonremovedlocation c
286 ON b.locationid = c.id
287 AND c.placetypeid IN ($PLACE_TYPE_ATTRACTION, $PLACE_TYPE_EATERY, $PLACE_TYPE_ACCOMMODATION)
288 -- Filter out VRs (accommodation types 33-39)
289 LEFT OUTER JOIN t_accomodation d ON d.locationid = c.id AND NOT d.whattype IN (33,34,35,36,37,38,39)
290 -- Filter out closed locations
291 LEFT OUTER JOIN t_location_closing_info e on e.locationid = c.id
292 WHERE (c.placetypeid != $PLACE_TYPE_ACCOMMODATION OR d.locationid IS NOT NULL)
293 AND e.locationid IS NULL"
294}
295
296function create_eligible_property_review_counts
297{
298 hive_command "CREATE TABLE IF NOT EXISTS ${TMP_PREFIX}_eligible_properties_uniq (locationid int)"
299 hive_command_compress_limit "INSERT OVERWRITE TABLE ${TMP_PREFIX}_eligible_properties_uniq SELECT DISTINCT property_id FROM $TBL_ELIGIBLE_PROPERTIES"
300
301 hive_command "CREATE TABLE IF NOT EXISTS $TBL_ELIGIBLE_PROPERTY_REVIEW_COUNTS (locationid int, num_reviews int)"
302 return_reviews_for_locations ${TMP_PREFIX}_eligible_properties_uniq $TBL_ELIGIBLE_PROPERTY_REVIEW_COUNTS
303}
304
305function create_eligible_property_reviews
306{
307 # Get all valid reviews for eligible properties
308 hive_command "CREATE TABLE IF NOT EXISTS $TBL_ELIGIBLE_PROPERTY_REVIEWS
309 (
310 geo_id INT,
311 property_id INT,
312 property_type_id INT,
313 review_id INT,
314 review_lang STRING,
315 review_title STRING,
316 review_rating STRING,
317 review_published STRING
318 )
319 COMMENT 'eligible property reviews'"
320
321 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_ELIGIBLE_PROPERTY_REVIEWS
322 SELECT
323 a.geo_id,
324 a.property_id,
325 a.property_type_id,
326 b.id AS review_id,
327 substr(b.lang, 0, 2) AS review_lang,
328 substr(b.title, 0, 100) AS review_title,
329 b.rating AS review_rating,
330 b.tspublished AS review_published
331 FROM $TBL_ELIGIBLE_PROPERTIES a
332 JOIN t_userreview b
333 ON a.property_id = b.locationid
334 AND b.memberid > 0
335 AND length(trim(b.title)) >= 2
336 AND b.rating > 0
337 AND b.providerid = 0
338 AND b.pagetype = 1
339 AND b.status & 6 = 4
340 AND b.tspublished > ''"
341}
342
343function create_unique_property_views
344{
345 local PAGEVIEW_LOOKBACK_DATE=$(date -d '2 weeks ago' +%F)
346
347 hive_command "CREATE TABLE IF NOT EXISTS $TBL_UNIQUE_VIEWS
348 (
349 cookie STRING,
350 userid STRING,
351 location_id INT,
352 week INT
353 )
354 COMMENT 'unique views'"
355
356 hive_command_compress_limit_both "set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
357 INSERT OVERWRITE TABLE $TBL_UNIQUE_VIEWS
358 SELECT DISTINCT
359 cookie,
360 userid,
361 location_id,
362 week
363 FROM (
364 SELECT
365 ta_persistentcookie AS cookie,
366 user_id AS userid,
367 location_id,
368 if(ds >= '$ONE_WEEK_AGO', 1, 2) AS week
369 FROM a_blessed_sessions_with_lookback
370 WHERE ds >= '$PAGEVIEW_LOOKBACK_DATE'
371 AND ta_persistentcookie IS NOT NULL
372 AND location_id > 1
373 ) a"
374
375 hive_command "CREATE TABLE IF NOT EXISTS $TBL_MEMBER_PROPERTY_VIEWS
376 (
377 userid STRING,
378 geo_id INT,
379 property_id INT,
380 property_type_id INT,
381 week INT
382 )
383 COMMENT 'member property views'"
384
385 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_MEMBER_PROPERTY_VIEWS
386 SELECT
387 b.userid,
388 a.geo_id,
389 a.property_id,
390 a.property_type_id,
391 b.week
392 FROM $TBL_ELIGIBLE_PROPERTIES a
393 JOIN (
394 SELECT DISTINCT userid, location_id, week
395 FROM $TBL_UNIQUE_VIEWS
396 WHERE userid IS NOT NULL
397 ) b on a.property_id = b.location_id"
398
399 hive_command "CREATE TABLE IF NOT EXISTS $TBL_UNIQUE_PROPERTY_VIEWS
400 (
401 geo_id INT,
402 property_id INT,
403 property_type_id INT,
404 week INT,
405 view_count INT
406 )
407 COMMENT 'unique property views'"
408
409 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_UNIQUE_PROPERTY_VIEWS
410 SELECT
411 a.geo_id,
412 a.property_id,
413 a.property_type_id,
414 b.week,
415 count(*) as view_count
416 FROM $TBL_ELIGIBLE_PROPERTIES a
417 JOIN $TBL_UNIQUE_VIEWS b
418 ON a.property_id = b.location_id
419 GROUP BY a.geo_id, a.property_id, a.property_type_id, b.week"
420}
421
422function load_weather_data
423{
424 import_tab_data_to_hadoop $TRDATA/weather/forecasts.csv weather_forecast_data
425
426 WEATHER_FIELDS="locationid int,partnerid string,gmtoffset int"
427 WEATHER_CLAUSE=""
428
429 for i in `seq 0 4`; do
430 WEATHER_FIELDS="${WEATHER_FIELDS},date${i} string,low${i} int,high${i} int,icon${i} string"
431 WEATHER_CLAUSE="${WEATHER_CLAUSE} SELECT locationid, date${i} as forecast_date, low${i} as low, high${i} as high FROM crm_weather_forecast_data"
432 if [ $i -lt 4 ]; then
433 WEATHER_CLAUSE="${WEATHER_CLAUSE} UNION ALL"
434 fi
435 done
436
437 echo "WEATHER_FIELDS = $WEATHER_FIELDS"
438 echo "WEATHER_CLAUSE= $WEATHER_CLAUSE"
439
440 load_formatted_external_data_to_hive weather_forecast_data crm_weather_forecast_data "$WEATHER_FIELDS" "weather data for crm usage" ","
441
442 hive_command "CREATE TABLE IF NOT EXISTS crm_weather_data (locationid int, forecast_date string, low int, high int) COMMENT 'geo forecast high low temps in f'"
443
444 hive_command_compress_limit "INSERT OVERWRITE TABLE crm_weather_data
445 SELECT locationid,
446 -- dates from file are MM/DD/YYYY - convert to YYYY-MM-DD
447 concat(substr(forecast_date, 7, 4), '-', substr(forecast_date, 1, 2), '-', substr(forecast_date, 4, 2)),
448 low, high FROM (
449 $WEATHER_CLAUSE
450 ) A;"
451}
452
453function create_geo_details
454{
455 local NEW_CONTENT_LOOKBACK_DATE=$ONE_WEEK_AGO
456
457 hive_command "CREATE TABLE IF NOT EXISTS ${TMP_PREFIX}_new_review_counts (property_id int, geo_id int, property_type_id int, num_reviews int)"
458
459 hive_command_compress_limit "INSERT OVERWRITE TABLE ${TMP_PREFIX}_new_review_counts
460 SELECT a.property_id, a.geo_id, a.property_type_id, count(distinct ur.id) AS num_reviews
461 FROM (SELECT DISTINCT property_id, geo_id, property_type_id FROM $TBL_ELIGIBLE_PROPERTIES) a
462 JOIN t_userreview ur ON (a.property_id = ur.locationid)
463 WHERE ur.status = 4
464 AND ur.pagetype = 1
465 AND ur.tspublished IS NOT NULL AND tspublished >= '$NEW_CONTENT_LOOKBACK_DATE'
466 GROUP BY a.property_id, a.geo_id, a.property_type_id;"
467
468 hive_command "CREATE TABLE IF NOT EXISTS $TBL_GEO_DETAILS
469 (
470 geo_id INT,
471 send_date STRING,
472 new_view_count INT,
473 new_forum_topic_count INT,
474 new_review_count INT,
475 new_hotel_review_count INT,
476 new_restaurant_review_count INT,
477 new_attraction_review_count INT,
478 hotel_count INT,
479 restaurant_count INT,
480 attraction_count INT,
481 special_offer_count INT,
482 rental_count INT,
483 temp_f_high INT,
484 temp_f_low INT,
485 temp_c_high INT,
486 temp_c_low INT
487 )
488 COMMENT 'geo details'"
489
490 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_GEO_DETAILS
491 SELECT
492 a.geo_id AS geo_id,
493 '$SEND_DATE' AS send_date,
494 COALESCE(cast(b.new_view_count AS INT), 0) AS new_view_count,
495 COALESCE(cast(c.new_forum_topic_count AS INT), 0) AS new_forum_topic_count,
496 COALESCE(cast(d.new_review_count AS INT), 0) AS new_review_count,
497 COALESCE(cast(d.new_hotel_review_count AS INT), 0) AS new_hotel_review_count,
498 COALESCE(cast(d.new_restaurant_review_count AS INT), 0) AS new_restaurant_review_count,
499 COALESCE(cast(d.new_attraction_review_count AS INT), 0) AS new_attraction_review_count,
500 COALESCE(cast(e.hotel_count AS INT), 0) AS hotel_count,
501 COALESCE(cast(e.restaurant_count AS INT), 0) AS restaurant_count,
502 COALESCE(cast(e.attraction_count AS INT), 0) AS attraction_count,
503 COALESCE(cast(f.special_offer_count AS INT), 0) AS special_offer_count,
504 COALESCE(cast(g.rental_count AS INT), 0) AS rental_count,
505 h.high AS temp_f_high,
506 h.low AS temp_f_low,
507 round((h.high-32)/1.8) AS temp_c_high,
508 round((h.low-32)/1.8) AS temp_c_low
509
510 FROM $TBL_ELIGIBLE_GEOS a
511
512 -- Get unique property view count
513 LEFT OUTER JOIN (
514 SELECT geo_id, count(distinct cookie) AS new_view_count
515 FROM $TBL_UNIQUE_VIEWS a
516 JOIN $TBL_ELIGIBLE_PROPERTIES b on a.location_id = b.property_id
517 WHERE week = 1
518 GROUP BY geo_id
519 ) b
520 ON a.geo_id = b.geo_id
521
522 -- Get forum topics from the past week
523 LEFT OUTER JOIN (
524 SELECT
525 a.geo_id,
526 count(*) as new_forum_topic_count
527 FROM $TBL_ELIGIBLE_GEOS a
528 JOIN t_forum b
529 ON a.geo_id = b.location_id
530 AND b.status = 0
531 JOIN t_topic c
532 ON b.id = c.forum_id
533 AND c.status < 4
534 JOIN t_post d
535 ON c.id = d.topic_id
536 AND d.memberid > 0
537 AND d.status in (0,25)
538 AND d.id = c.first_post_id
539 AND d.creation_time >= '$NEW_CONTENT_LOOKBACK_DATE'
540 GROUP BY a.geo_id
541 ) c
542 ON a.geo_id = c.geo_id
543
544 -- Get reviews from the past week
545 LEFT OUTER JOIN (
546 SELECT
547 geo_id,
548 sum(num_reviews) as new_review_count,
549 sum(if(property_type_id = $PLACE_TYPE_ACCOMMODATION, num_reviews, 0)) AS new_hotel_review_count,
550 sum(if(property_type_id = $PLACE_TYPE_EATERY, num_reviews, 0)) AS new_restaurant_review_count,
551 sum(if(property_type_id = $PLACE_TYPE_ATTRACTION, num_reviews, 0)) AS new_attraction_review_count
552 FROM ${TMP_PREFIX}_new_review_counts
553 GROUP BY geo_id
554 ) d
555 ON a.geo_id = d.geo_id
556
557 -- Get property type counts
558 LEFT OUTER JOIN (
559 SELECT
560 geo_id,
561 count(if(property_type_id = $PLACE_TYPE_ACCOMMODATION, 1, null)) AS hotel_count,
562 count(if(property_type_id = $PLACE_TYPE_EATERY, 1, null)) AS restaurant_count,
563 count(if(property_type_id = $PLACE_TYPE_ATTRACTION, 1, null)) AS attraction_count
564 FROM $TBL_ELIGIBLE_PROPERTIES
565 GROUP BY geo_id
566 ) e
567 ON a.geo_id = e.geo_id
568
569 -- Get special offer count
570 LEFT OUTER JOIN (
571 SELECT
572 b.geo_id,
573 count(*) AS special_offer_count
574 FROM a_geo_coupons a
575 JOIN $TBL_ELIGIBLE_PROPERTIES b
576 ON a.geo_id = b.geo_id AND a.hotel_id = b.property_id
577 WHERE a.lang = 'en'
578 GROUP BY b.geo_id
579 ) f
580 ON a.geo_id = f.geo_id
581
582 -- Get vacation rental count
583 LEFT OUTER JOIN (
584 SELECT
585 c.geo_id,
586 count(*) AS rental_count
587 FROM t_vacationrentaldata a
588 JOIN t_locationpaths b
589 ON a.locationid = b.locationid
590 AND b.pathtype = 1
591 JOIN $TBL_ELIGIBLE_GEOS c
592 ON b.parentid = c.geo_id
593 GROUP BY c.geo_id
594 ) g
595 ON a.geo_id = g.geo_id
596
597 -- Get temps
598 LEFT OUTER JOIN crm_weather_data h ON a.geo_id = h.locationid AND h.forecast_date = '${SEND_DATE}'"
599
600 # Notes
601 # ---------------------------------
602 # t_post.status: 0 -> normal [com.TripResearch.object.forum.PostInfo]
603 # t_post.status: 25 -> edited [com.TripResearch.object.forum.PostInfo]
604 # t_topic.status: 4 -> deleted [com.tripadvisor.forums.base.Topic]
605 # t_forum.status: 0 -> normal [com.tripadvisor.forums.base.Forum]
606 # t_userreview.pagetype: 1 -> regular review (not photo only, owner response, etc.)
607 # t_userreview.providerid: 0 -> tripadvisor
608 # t_userreview.status: 2 -> removed [com.TripResearch.object.ContentStatus]
609 # t_userreview.status: 4 -> published [com.TripResearch.object.ContentStatus]
610
611}
612
613function create_top_lists
614{
615 # Spotlight the most popular $MAX_LIST_LENGTH properties for each eligible
616 # geo in the following categories, which rotate weekly:
617 #
618 # [+] Top romantic restaurants
619 # [+] Top restaurants for lunch
620 # [+] Top restaurants for dessert
621 # [+] Top attractions
622 # [+] Top activities
623 # [+] Top shopping
624 # [+] Top tours
625
626 local MIN_REVIEW_RATING=3
627 local MAX_LIST_LENGTH=5
628
629
630
631 # Restaurant feature constants (see t_eateryfeature)
632 local EATERY_DATA_TYPE_FEATURE=1
633 local RESTAURANT_FEATURE_ROMANTIC=3
634 local RESTAURANT_FEATURE_FAMILY=5
635 local RESTAURANT_FEATURE_LUNCH=30
636 local RESTAURANT_FEATURE_DESSERT=59
637
638 # Attraction category constants (see t_category3 and AttractionCategories.java)
639 local ATTRACTION_CATEGORY_ATTRACTIONS="43, 44, 45, 46, 47, 48, 49, 50, 51, null"
640 local ATTRACTION_CATEGORY_ACTIVITIES="25, 36, 37, 38, 39, 40, 41, 42"
641 local ATTRACTION_CATEGORY_SHOPPING="26"
642 local ATTRACTION_CATEGORY_TOURS="42"
643
644 local TBL_TOP_LIST_TYPES=${TMP_PREFIX}_top_list_types
645
646 hive_command "CREATE TABLE IF NOT EXISTS $TBL_TOP_LIST_TYPES
647 (
648 index INT,
649 type STRING
650 )
651 COMMENT 'top list types'"
652
653 # temporarily remove all restaurant top lists
654 hive_command_compress_limit "INSERT OVERWRITE TABLE $TBL_TOP_LIST_TYPES
655 SELECT * FROM (
656 SELECT 0, 'top_attractions' FROM dual
657 UNION ALL
658 SELECT 1, 'top_activities' FROM dual
659 UNION ALL
660 SELECT 2, 'top_shopping' FROM dual
661 UNION ALL
662 SELECT 3, 'top_tours' FROM dual
663 ) a"
664
665 hive_command "CREATE TABLE IF NOT EXISTS $TBL_ELIGIBLE_RESTAURANT_FEATURES
666 (
667 property_id INT,
668 restaurant_feature_id INT
669 )
670 COMMENT 'eligible restaurant features'"
671
672 # TBL_ELIGIBLE_PROPERTIES is unique on geo_id, property_id so we need to de-dup here
673 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_ELIGIBLE_RESTAURANT_FEATURES
674 SELECT DISTINCT
675 a.property_id,
676 b.restaurant_feature_id
677 FROM $TBL_ELIGIBLE_PROPERTIES a
678 JOIN
679 (
680 SELECT locationid, value AS restaurant_feature_id
681 FROM t_eatery_data
682 WHERE type = $EATERY_DATA_TYPE_FEATURE
683 UNION ALL
684 SELECT locationid, featureid AS restaurant_feature_id
685 FROM t_eateryfeatures
686 ) b ON a.property_id = b.locationid
687 WHERE a.property_type_id = $PLACE_TYPE_EATERY"
688
689
690 # TBL_ELIGIBLE_PROPERTIES is unique on geo_id, property_id so we need to de-dup here
691 hive_command "CREATE TABLE IF NOT EXISTS $TBL_ELIGIBLE_ATTRACTION_CATEGORIES
692 (
693 property_id INT,
694 attraction_category_id INT
695 )
696 COMMENT 'eligible attraction categories'"
697
698 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_ELIGIBLE_ATTRACTION_CATEGORIES
699 SELECT DISTINCT
700 a.property_id,
701 b.categoryid AS attraction_category_id
702 FROM $TBL_ELIGIBLE_PROPERTIES a
703 JOIN t_attraction b
704 ON a.property_id = b.locationid
705 WHERE a.property_type_id = $PLACE_TYPE_ATTRACTION"
706
707
708 hive_command "CREATE TABLE IF NOT EXISTS $TBL_TOP_LISTS_AGGREGATE
709 (
710 geo_id INT,
711 property_id INT,
712 property_type_id INT,
713 top_list_type STRING,
714 popularity INT
715 )
716 COMMENT 'top lists base'"
717
718 local TOP_LIST_TYPE_COUNT=`$HIVEPCMD "SELECT COUNT(*) FROM $TBL_TOP_LIST_TYPES"`
719
720 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_TOP_LISTS_AGGREGATE
721 SELECT
722 b.geo_id,
723 b.property_id,
724 b.property_type_id,
725 a.top_list_type,
726 c.popularity
727 FROM (
728
729 SELECT
730 property_id,
731 'top_restaurants_romantic' AS top_list_type
732 FROM $TBL_ELIGIBLE_RESTAURANT_FEATURES
733 WHERE restaurant_feature_id = $RESTAURANT_FEATURE_ROMANTIC
734 UNION ALL
735 SELECT
736 property_id,
737 'top_restaurants_lunch' AS top_list_type
738 FROM $TBL_ELIGIBLE_RESTAURANT_FEATURES
739 WHERE restaurant_feature_id = $RESTAURANT_FEATURE_LUNCH
740 UNION ALL
741 SELECT
742 property_id,
743 'top_restaurants_dessert' AS top_list_type
744 FROM $TBL_ELIGIBLE_RESTAURANT_FEATURES
745 WHERE restaurant_feature_id = $RESTAURANT_FEATURE_DESSERT
746 UNION ALL
747 SELECT
748 property_id,
749 'top_restaurants_family' AS top_list_type
750 FROM $TBL_ELIGIBLE_RESTAURANT_FEATURES
751 WHERE restaurant_feature_id = $RESTAURANT_FEATURE_FAMILY
752 UNION ALL
753 SELECT
754 property_id,
755 'top_attractions' AS top_list_type
756 FROM $TBL_ELIGIBLE_ATTRACTION_CATEGORIES
757 WHERE attraction_category_id IN ($ATTRACTION_CATEGORY_ATTRACTIONS)
758 UNION ALL
759 SELECT
760 property_id,
761 'top_activities' AS top_list_type
762 FROM $TBL_ELIGIBLE_ATTRACTION_CATEGORIES
763 WHERE attraction_category_id IN ($ATTRACTION_CATEGORY_ACTIVITIES)
764 UNION ALL
765 SELECT
766 property_id,
767 'top_shopping' AS top_list_type
768 FROM $TBL_ELIGIBLE_ATTRACTION_CATEGORIES
769 WHERE attraction_category_id IN ($ATTRACTION_CATEGORY_SHOPPING)
770 UNION ALL
771 SELECT
772 property_id,
773 'top_tours' AS top_list_type
774 FROM $TBL_ELIGIBLE_ATTRACTION_CATEGORIES
775 WHERE attraction_category_id IN ($ATTRACTION_CATEGORY_TOURS)
776 ) a
777 JOIN $TBL_ELIGIBLE_PROPERTIES b
778 ON a.property_id = b.property_id
779 JOIN t_locationpopularity c
780 ON a.property_id = c.locationid AND c.segmentid = 0 AND b.geo_id = c.parentid
781 JOIN $TBL_TOP_LIST_TYPES d
782 ON d.index = ((b.geo_id + $WEEK_OF_YEAR) % $TOP_LIST_TYPE_COUNT) and d.type = a.top_list_type"
783
784
785 hive_command "CREATE TABLE IF NOT EXISTS $TBL_TOP_LISTS
786 (
787 geo_id INT,
788 property_id INT,
789 property_type_id INT,
790 property_review_count INT,
791 property_last_review_id INT,
792 property_last_review_lang STRING,
793 property_last_review_title STRING,
794 top_list_type STRING,
795 popularity INT,
796 rank INT
797 )
798 COMMENT 'top lists'"
799
800 hive_command_compress_limit_both "CREATE TEMPORARY FUNCTION HiveUdfRank AS 'com.tripadvisor.warehouse.hive.HiveUdfRank';
801 INSERT OVERWRITE TABLE $TBL_TOP_LISTS
802 SELECT
803 geo_id,
804 property_id,
805 property_type_id,
806 property_review_count,
807 property_last_review_id,
808 property_last_review_lang,
809 property_last_review_title,
810 top_list_type,
811 popularity,
812 rank
813 FROM (
814 SELECT a.*, HiveUdfRank(concat(cast(geo_id AS STRING), property_last_review_lang)) as rank
815 FROM (
816
817 SELECT
818 a.geo_id,
819 a.property_id,
820 a.property_type_id,
821 b.num_reviews as property_review_count,
822 c.review_array[1] AS property_last_review_id,
823 c.review_lang AS property_last_review_lang,
824 substr(c.review_array[2], 0, 100) AS property_last_review_title,
825 a.top_list_type,
826 a.popularity
827
828 FROM $TBL_TOP_LISTS_AGGREGATE a
829
830 -- Get total review count for each property
831 JOIN $TBL_ELIGIBLE_PROPERTY_REVIEW_COUNTS b ON a.property_id = b.locationid
832
833 -- Get the latest review in each language for each property
834 JOIN (
835 SELECT
836 a.property_id,
837 max(array(b.review_published, b.review_id, b.review_title)) AS review_array,
838 b.review_lang
839 FROM $TBL_TOP_LISTS_AGGREGATE a
840 JOIN $TBL_ELIGIBLE_PROPERTY_REVIEWS b
841 ON a.property_id = b.property_id
842 AND b.review_rating >= $MIN_REVIEW_RATING
843 GROUP BY
844 a.property_id,
845 b.review_lang
846 ) c
847 ON a.property_id = c.property_id
848
849 DISTRIBUTE BY geo_id
850
851 SORT BY
852 geo_id,
853 property_last_review_lang,
854 popularity ASC
855 ) a
856 ) b
857 WHERE rank <= $MAX_LIST_LENGTH"
858}
859
860function create_trending
861{
862 # A mix of hotels, restaurants, and attractions that have gained the most
863 # percentage of page views over the last week
864
865 local MIN_VIEW_COUNT=100
866 local MIN_RATING=30
867 local MAX_PROPERTY_TYPE_COUNT=2
868 local TRENDING_HISTORY_LOOKBACK_DATE=$(date -d "$SEND_DATE - 3 weeks" +%F)
869
870
871 hive_command "CREATE TABLE IF NOT EXISTS a_trending_history
872 (
873 geo_id INT,
874 property_id INT
875 )
876 COMMENT 'historical trending properties'
877 PARTITIONED BY (ds STRING)"
878
879 # Make a dummy insert to initialize the trending history table (necessary if this is the
880 # first time the entire table is being created)
881 hive_command "INSERT OVERWRITE TABLE a_trending_history PARTITION (ds = '$SEND_DATE')
882 SELECT 0 AS geo_id, 0 as property_id FROM t_placetype WHERE false;"
883
884
885 hive_command "CREATE TABLE IF NOT EXISTS ${TMP_PREFIX}_trending_all
886 (
887 geo_id INT,
888 property_id INT,
889 property_type_id INT,
890 property_rating INT
891 )
892 COMMENT 'trending'"
893
894 hive_command_compress_limit_both "CREATE TEMPORARY FUNCTION HiveUdfRank AS 'com.tripadvisor.warehouse.hive.HiveUdfRank';
895
896 INSERT OVERWRITE TABLE ${TMP_PREFIX}_trending_all
897
898 SELECT geo_id, property_id, property_type_id, property_rating FROM (
899 SELECT
900 geo_id,
901 property_id,
902 property_type_id,
903 property_rating,
904 HiveUdfRank(concat(geo_id, property_type_id)) as rank
905
906 FROM (
907
908 SELECT
909 a.geo_id,
910 a.property_id,
911 a.property_type_id,
912 if(a.new_view_count > $MIN_VIEW_COUNT, a.new_view_count / a.old_view_count, 0) AS view_growth,
913 l.score as property_rating
914
915 FROM (
916
917 SELECT
918 geo_id,
919 property_id,
920 property_type_id,
921 sum(if(week = 2, view_count, 0)) AS old_view_count,
922 sum(if(week = 1, view_count, 0)) AS new_view_count
923 FROM $TBL_UNIQUE_PROPERTY_VIEWS
924 GROUP BY
925 geo_id,
926 property_id,
927 property_type_id
928 ) a
929
930 -- Get property rating and only retain properties with at least $MIN_RATING rating.
931 JOIN t_locationratings l on a.property_id = l.locationid AND l.questionid = 0 AND l.segmentid = 0 and l.score >= $MIN_RATING
932
933 -- Suppress trending properties that have been featured in the campaign
934 -- since $TRENDING_HISTORY_LOOKBACK_DATE
935 LEFT OUTER JOIN a_trending_history c
936 ON a.geo_id = c.geo_id
937 AND a.property_id = c.property_id
938 AND c.ds >= '$TRENDING_HISTORY_LOOKBACK_DATE'
939
940 WHERE c.geo_id IS NULL
941
942 DISTRIBUTE BY geo_id
943
944 SORT BY
945 geo_id,
946 property_type_id,
947 view_growth DESC
948
949 ) a
950 ) b
951 WHERE rank <= $MAX_PROPERTY_TYPE_COUNT"
952
953 hive_command "CREATE TABLE IF NOT EXISTS ${TMP_PREFIX}_trending_counts
954 (
955 geo_id INT,
956 attr_count INT,
957 rest_count INT,
958 hotel_count INT
959 )
960 COMMENT 'trending property type counts'"
961
962 hive_command_compress_limit "INSERT OVERWRITE TABLE ${TMP_PREFIX}_trending_counts
963 SELECT geo_id, sum(if(property_type_id == 10021, 1, 0)) as attr_count,
964 sum(if(property_type_id == 10022, 1, 0)) as rest_count,
965 sum(if(property_type_id == 10023, 1, 0)) as hotel_count
966 FROM ${TMP_PREFIX}_trending_all
967 GROUP BY geo_id"
968
969 hive_command "CREATE TABLE IF NOT EXISTS $TBL_TRENDING LIKE ${TMP_PREFIX}_trending_all"
970
971 # Require trending geos to have at least 2 qualifying restaurants and attractions, as well as 1 hotel
972 hive_command_compress_limit "INSERT OVERWRITE TABLE $TBL_TRENDING
973 SELECT a.* FROM ${TMP_PREFIX}_trending_all a
974 LEFT SEMI JOIN ${TMP_PREFIX}_trending_counts c ON a.geo_id = c.geo_id AND
975 c.attr_count >= 2 AND c.rest_count >= 2 and c.hotel_count >= 1"
976}
977
978function create_preliminary_send_list
979{
980 local EMAIL_LOOKBACK_DATE=$(date -d '3 days ago' +%F)
981
982 # Create recipients table (i.e. send list without usernames)
983 hive_command "CREATE TABLE IF NOT EXISTS $TBL_RECIPIENTS
984 (
985 member_id INT,
986 geo_id INT,
987 userid STRING,
988 email STRING,
989 locale STRING,
990 lang STRING,
991 home_geo_id INT,
992 home_airport_code STRING,
993 dest_airport_code STRING,
994 creative_type STRING
995 )
996 COMMENT 'recipients'"
997
998 hive_command_compress_limit_both "set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
999 CREATE TEMPORARY FUNCTION HiveUdfRank AS 'com.tripadvisor.warehouse.hive.HiveUdfRank';
1000 INSERT OVERWRITE TABLE $TBL_RECIPIENTS
1001
1002 SELECT member_id, geo_id, userid, email, locale, lang, home_geo_id, home_airport_code, dest_airport_code, creative_type
1003 FROM (
1004 SELECT *, HiveUdfRank(concat(member_id, geo_id)) as rank
1005 FROM (
1006
1007 SELECT
1008 a.member_id,
1009 a.geo_id,
1010 b.userid,
1011 b.email,
1012 c.map_locale AS locale,
1013 substr(c.map_locale, 0, 2) AS lang,
1014 d.origin_locationid AS home_geo_id,
1015 d.airport_code AS home_airport_code,
1016 e.airportcode AS dest_airport_code,
1017 a.creative_type
1018
1019 FROM $TBL_WATCHED_LOCATIONS a
1020
1021 JOIN vw_t_member b
1022 ON a.member_id = b.memberid
1023 AND b.deliverability = 0
1024
1025 JOIN crm_locale_maps c
1026 ON b.locale = c.lang_code
1027
1028 LEFT OUTER JOIN (SELECT DISTINCT geo_id FROM $TBL_TRENDING) t on t.geo_id = a.geo_id
1029
1030 -- Get home geo and airport
1031 LEFT OUTER JOIN a_member_origin d
1032 ON a.member_id = d.memberid
1033
1034 -- Get airport nearest to watched geo
1035 LEFT OUTER JOIN t_nearestairports e
1036 ON a.geo_id = e.locationid
1037
1038 -- Filter out any users who may have received this email since $EMAIL_LOOKBACK_DATE
1039 LEFT OUTER JOIN (
1040 SELECT
1041 a.userid,
1042 c.nvp_int_val AS geo_id
1043 FROM f_ech_daily a
1044 JOIN d_email_type b
1045 ON a.email_type_id = b.email_type_id
1046 AND b.email_name_id = $EMAIL_NAME_ID
1047 JOIN f_email_sends_attributes c
1048 ON c.ds >= '$EMAIL_LOOKBACK_DATE'
1049 AND a.email_contact_id = c.email_contact_id
1050 AND c.nvp_name = 'geo_id'
1051 WHERE a.ds >= '$EMAIL_LOOKBACK_DATE'
1052 ) f
1053 ON b.userid = f.userid
1054 AND a.geo_id = f.geo_id
1055
1056 WHERE f.userid IS NULL
1057 -- filter recipients with trending creative type for geos without trending data
1058 AND NOT (a.creative_type = '${CREATIVE_TYPE_TRENDING}' AND t.geo_id IS NULL)
1059
1060 CLUSTER BY member_id, geo_id
1061
1062 ) a
1063 ) b
1064
1065 -- Defensive dedupe
1066 WHERE rank = 1"
1067
1068 local TBL_RECIPIENTS_UNIQ=${TBL_RECIPIENTS}_uniq
1069 hive_command "CREATE TABLE IF NOT EXISTS $TBL_RECIPIENTS_UNIQ (member_id INT) COMMENT 'unique recipients'"
1070 hive_command_compress_limit "INSERT OVERWRITE TABLE $TBL_RECIPIENTS_UNIQ SELECT DISTINCT member_id FROM $TBL_RECIPIENTS"
1071
1072 # Collect usernames
1073 hive_command "CREATE TABLE IF NOT EXISTS $TBL_RECIPIENT_USERNAMES
1074 (
1075 member_id INT,
1076 username STRING
1077 )
1078 COMMENT 'recipient usernames'"
1079
1080 collect_usernames $TBL_RECIPIENTS_UNIQ $TBL_RECIPIENT_USERNAMES "member_id"
1081
1082
1083 # Generate preliminary send list by combining recipients table with usernames.
1084 # The final send list will be obtained later after we can ensure that there are
1085 # a sufficient number of featured hotels to show the recipient.
1086 hive_command "CREATE TABLE IF NOT EXISTS $TBL_PRELIMINARY_SEND_LIST
1087 (
1088 member_id INT,
1089 geo_id INT,
1090 userid STRING,
1091 username STRING,
1092 email STRING,
1093 locale STRING,
1094 lang STRING,
1095 home_geo_id INT,
1096 home_airport_code STRING,
1097 dest_airport_code STRING,
1098 creative_type STRING,
1099 email_guid STRING,
1100 pool_num INT,
1101 pool STRING
1102 )
1103 COMMENT 'send all'"
1104
1105 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_PRELIMINARY_SEND_LIST
1106 SELECT
1107 a.member_id,
1108 a.geo_id,
1109 a.userid,
1110 b.username,
1111 a.email,
1112 a.locale,
1113 a.lang,
1114 a.home_geo_id,
1115 a.home_airport_code,
1116 a.dest_airport_code,
1117 a.creative_type,
1118 NULL AS email_guid,
1119 NULL AS pool_num,
1120 NULL AS pool
1121 FROM $TBL_RECIPIENTS a
1122 JOIN (
1123 SELECT DISTINCT member_id, username
1124 FROM $TBL_RECIPIENT_USERNAMES
1125 ) b
1126 ON a.member_id = b.member_id"
1127}
1128
1129function create_featured_hotels
1130{
1131 local MINUEND=1000000
1132 local MIN_REVIEW_RATING=3
1133 local MIN_RATING=30
1134 local MAX_POPULARITY=25
1135 local MAX_RESULTS=3
1136 local NEW_REVIEW_LOOKBACK_DATE=$ONE_WEEK_AGO
1137 local priority=0
1138
1139
1140
1141
1142
1143 hive_command "CREATE TABLE IF NOT EXISTS $TBL_RECENTLY_VIEWED_HOTELS
1144 (
1145 member_id INT,
1146 geo_id INT,
1147 property_id INT,
1148 lang STRING
1149 )
1150 COMMENT 'recently viewed hotels'"
1151
1152 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_RECENTLY_VIEWED_HOTELS
1153 SELECT
1154 a.member_id,
1155 a.geo_id,
1156 b.property_id,
1157 a.lang
1158 FROM $TBL_PRELIMINARY_SEND_LIST a
1159 JOIN (
1160 SELECT DISTINCT
1161 userid,
1162 geo_id,
1163 property_id
1164 FROM $TBL_MEMBER_PROPERTY_VIEWS
1165 WHERE property_type_id = $PLACE_TYPE_ACCOMMODATION
1166 AND week <= 2
1167 ) b
1168 ON a.userid = b.userid
1169 AND a.geo_id = b.geo_id"
1170
1171
1172 hive_command "CREATE TABLE IF NOT EXISTS $TBL_RECENTLY_VIEWED_HOTELS_WITH_NEW_REVIEW
1173 (
1174 member_id INT,
1175 geo_id INT,
1176 property_id INT,
1177 review_id INT,
1178 review_title STRING,
1179 score STRING
1180 )
1181 COMMENT 'recently viewed hotels with new review'"
1182
1183 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_RECENTLY_VIEWED_HOTELS_WITH_NEW_REVIEW
1184 SELECT
1185 member_id,
1186 geo_id,
1187 property_id,
1188 review_array[1] AS review_id,
1189 review_array[2] AS review_title,
1190 review_array[0] AS score
1191 FROM (
1192 SELECT
1193 a.member_id,
1194 a.geo_id,
1195 a.property_id,
1196 max(array(b.review_published, b.review_id, b.review_title)) as review_array
1197 FROM $TBL_RECENTLY_VIEWED_HOTELS a
1198 JOIN $TBL_ELIGIBLE_PROPERTY_REVIEWS b
1199 ON a.geo_id = b.geo_id
1200 AND a.property_id = b.property_id
1201 AND a.lang = b.review_lang
1202 AND b.review_rating >= $MIN_REVIEW_RATING
1203 AND b.review_published >= '$NEW_REVIEW_LOOKBACK_DATE'
1204 GROUP BY
1205 a.member_id,
1206 a.geo_id,
1207 a.property_id
1208 ) a"
1209
1210
1211 hive_command "CREATE TABLE IF NOT EXISTS $TBL_RECENTLY_VIEWED_HOTELS_WITHOUT_NEW_REVIEW
1212 (
1213 member_id INT,
1214 geo_id INT,
1215 property_id INT,
1216 review_id INT,
1217 review_title STRING,
1218 score STRING
1219 )
1220 COMMENT 'recently viewed hotels without new review'"
1221
1222
1223 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_RECENTLY_VIEWED_HOTELS_WITHOUT_NEW_REVIEW
1224 SELECT
1225 member_id,
1226 geo_id,
1227 property_id,
1228 review_array[1] AS review_id,
1229 review_array[2] AS review_title,
1230 review_array[0] AS score
1231 FROM (
1232 SELECT
1233 a.member_id,
1234 a.geo_id,
1235 a.property_id,
1236 max(array(b.review_published, b.review_id, b.review_title)) as review_array
1237 FROM $TBL_RECENTLY_VIEWED_HOTELS a
1238 JOIN $TBL_ELIGIBLE_PROPERTY_REVIEWS b
1239 ON a.geo_id = b.geo_id
1240 AND a.property_id = b.property_id
1241 AND a.lang = b.review_lang
1242 AND b.review_rating >= $MIN_REVIEW_RATING
1243 GROUP BY
1244 a.member_id,
1245 a.geo_id,
1246 a.property_id
1247 ) a"
1248
1249
1250 hive_command "CREATE TABLE IF NOT EXISTS $TBL_RECOMMENDED_HOTELS
1251 (
1252 member_id INT,
1253 geo_id INT,
1254 property_id INT,
1255 review_id INT,
1256 review_title STRING,
1257 score STRING
1258 )
1259 COMMENT 'recommended hotels'"
1260
1261 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_RECOMMENDED_HOTELS
1262 SELECT
1263 member_id,
1264 geo_id,
1265 property_id,
1266 review_array[1] AS review_id,
1267 review_array[2] AS review_title,
1268 cast(($MINUEND - rank) AS STRING) AS score
1269 FROM (
1270 SELECT
1271 a.member_id,
1272 a.geo_id,
1273 b.rec_hotel AS property_id,
1274 b.rank AS rank,
1275 max(array(c.review_published, c.review_id, c.review_title)) as review_array
1276 FROM $TBL_RECENTLY_VIEWED_HOTELS a
1277 JOIN crm_recommended_hotels_exploded b
1278 ON a.property_id = b.hotel_id
1279 JOIN $TBL_ELIGIBLE_PROPERTY_REVIEWS c
1280 ON b.rec_hotel = c.property_id
1281 AND a.geo_id = c.geo_id
1282 AND a.lang = c.review_lang
1283 AND c.review_rating >= $MIN_REVIEW_RATING
1284 GROUP BY
1285 a.member_id,
1286 a.geo_id,
1287 b.rec_hotel,
1288 b.rank
1289 ) a"
1290
1291
1292 hive_command "CREATE TABLE IF NOT EXISTS $TBL_POPULAR_HOTELS_WITH_NEW_REVIEWS
1293 (
1294 member_id INT,
1295 geo_id INT,
1296 property_id INT,
1297 review_id INT,
1298 review_title STRING,
1299 score STRING
1300 )
1301 COMMENT 'popular hotels with new reviews'"
1302
1303 hive_command_compress_limit_both "CREATE TEMPORARY FUNCTION HiveUdfRank AS 'com.tripadvisor.warehouse.hive.HiveUdfRank';
1304 INSERT OVERWRITE TABLE $TBL_POPULAR_HOTELS_WITH_NEW_REVIEWS
1305 SELECT member_id, geo_id, property_id, review_id, review_title, score
1306 FROM (
1307 SELECT
1308 member_id,
1309 geo_id,
1310 property_id,
1311 review_id,
1312 review_title,
1313 score,
1314 HiveUdfRank(concat(member_id, geo_id, property_id)) as rank
1315 FROM (
1316 SELECT
1317 a.member_id,
1318 a.geo_id,
1319 b.property_id,
1320 b.review_id,
1321 b.review_title,
1322 b.review_published,
1323 b.score
1324 FROM $TBL_PRELIMINARY_SEND_LIST a
1325 JOIN (
1326 SELECT
1327 a.geo_id,
1328 a.lang,
1329 b.property_id,
1330 b.review_id,
1331 b.review_title,
1332 b.review_published,
1333 cast(($MINUEND - c.popularity) AS STRING) AS score
1334 FROM (SELECT DISTINCT geo_id, lang FROM $TBL_PRELIMINARY_SEND_LIST) a
1335 JOIN $TBL_ELIGIBLE_PROPERTY_REVIEWS b
1336 ON a.geo_id = b.geo_id
1337 AND a.lang = b.review_lang
1338 AND b.property_type_id = $PLACE_TYPE_ACCOMMODATION
1339 AND b.review_published >= '$NEW_REVIEW_LOOKBACK_DATE'
1340 AND b.review_rating >= $MIN_REVIEW_RATING
1341 JOIN t_locationpopularity c
1342 ON b.property_id = c.locationid
1343 AND c.segmentid = 0
1344 AND c.popularity <= $MAX_POPULARITY
1345 ) b ON a.geo_id = b.geo_id AND a.lang = b.lang
1346 DISTRIBUTE BY
1347 member_id,
1348 geo_id,
1349 property_id
1350 SORT BY
1351 member_id,
1352 geo_id,
1353 property_id,
1354 review_published DESC
1355 ) a
1356 ) b
1357 WHERE rank = 1"
1358
1359 hive_command "CREATE TABLE IF NOT EXISTS $TBL_HOTELS_WITH_NEW_REVIEWS
1360 (
1361 member_id INT,
1362 geo_id INT,
1363 property_id INT,
1364 review_id INT,
1365 review_title STRING,
1366 score STRING
1367 )
1368 COMMENT 'hotels with new reviews'"
1369
1370 hive_command_compress_limit_both "CREATE TEMPORARY FUNCTION HiveUdfRank AS 'com.tripadvisor.warehouse.hive.HiveUdfRank';
1371 INSERT OVERWRITE TABLE $TBL_HOTELS_WITH_NEW_REVIEWS
1372 SELECT member_id, geo_id, property_id, review_id, review_title, score
1373 FROM (
1374 SELECT
1375 member_id,
1376 geo_id,
1377 property_id,
1378 review_id,
1379 review_title,
1380 score,
1381 HiveUdfRank(concat(member_id, geo_id, property_id)) as rank
1382 FROM (
1383 SELECT
1384 a.member_id,
1385 a.geo_id,
1386 b.property_id,
1387 b.review_id,
1388 b.review_title,
1389 b.score
1390 FROM $TBL_PRELIMINARY_SEND_LIST a
1391 JOIN (
1392 SELECT
1393 a.geo_id,
1394 a.lang,
1395 b.property_id,
1396 b.review_id,
1397 b.review_title,
1398 b.review_published AS score
1399 FROM (SELECT DISTINCT geo_id, lang FROM $TBL_PRELIMINARY_SEND_LIST) a
1400 JOIN $TBL_ELIGIBLE_PROPERTY_REVIEWS b
1401 ON a.geo_id = b.geo_id
1402 AND a.lang = b.review_lang
1403 AND b.property_type_id = $PLACE_TYPE_ACCOMMODATION
1404 AND b.review_published >= '$NEW_REVIEW_LOOKBACK_DATE'
1405 AND b.review_rating >= $MIN_REVIEW_RATING
1406 ) b ON a.geo_id = b.geo_id AND a.lang = b.lang
1407 DISTRIBUTE BY
1408 member_id,
1409 geo_id,
1410 property_id
1411 SORT BY
1412 member_id,
1413 geo_id,
1414 property_id,
1415 score DESC
1416 ) a
1417 ) b
1418 WHERE rank = 1"
1419
1420
1421 hive_command "CREATE TABLE IF NOT EXISTS $TBL_FEATURED_HOTELS_AGGREGATE
1422 (
1423 member_id INT,
1424 geo_id INT,
1425 locationid INT,
1426 review_id INT,
1427 review_title STRING,
1428 featured_type STRING,
1429 priority INT,
1430 score STRING
1431 )
1432 COMMENT 'aggregate featured reviews'"
1433
1434 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_FEATURED_HOTELS_AGGREGATE
1435 SELECT
1436 a.member_id,
1437 a.geo_id,
1438 a.property_id as locationid,
1439 a.review_array[1] AS review_id,
1440 a.review_array[2] AS review_title,
1441 a.review_array[3] AS featured_type,
1442 a.review_array[0] AS priority,
1443 a.review_array[4] AS score
1444 FROM (
1445 SELECT
1446 member_id,
1447 geo_id,
1448 property_id,
1449 max(array(priority, review_id, review_title, featured_type, score)) AS review_array
1450 FROM (
1451 SELECT
1452 member_id,
1453 geo_id,
1454 property_id,
1455 review_id,
1456 review_title,
1457 'recently_viewed' AS featured_type,
1458 $((priority++)) AS priority,
1459 score
1460 FROM $TBL_RECENTLY_VIEWED_HOTELS_WITH_NEW_REVIEW
1461 UNION ALL
1462 SELECT
1463 member_id,
1464 geo_id,
1465 property_id,
1466 review_id,
1467 review_title,
1468 'recently_viewed' AS featured_type,
1469 $((priority++)) AS priority,
1470 score
1471 FROM $TBL_RECENTLY_VIEWED_HOTELS_WITHOUT_NEW_REVIEW
1472 UNION ALL
1473 SELECT
1474 member_id,
1475 geo_id,
1476 property_id,
1477 review_id,
1478 review_title,
1479 'recommended' AS featured_type,
1480 $((priority++)) AS priority,
1481 score
1482 FROM $TBL_RECOMMENDED_HOTELS
1483 UNION ALL
1484 SELECT
1485 member_id,
1486 geo_id,
1487 property_id,
1488 review_id,
1489 review_title,
1490 'recommended' AS featured_type,
1491 $((priority++)) AS priority,
1492 score
1493 FROM $TBL_POPULAR_HOTELS_WITH_NEW_REVIEWS
1494 UNION ALL
1495 SELECT
1496 member_id,
1497 geo_id,
1498 property_id,
1499 review_id,
1500 review_title,
1501 'recommended' AS featured_type,
1502 $((priority++)) AS priority,
1503 score
1504 FROM $TBL_HOTELS_WITH_NEW_REVIEWS
1505 ) a
1506 GROUP BY
1507 member_id,
1508 geo_id,
1509 property_id
1510 ) a"
1511
1512 hive_command "CREATE TABLE IF NOT EXISTS $TBL_FEATURED_HOTELS
1513 (
1514 member_id INT,
1515 geo_id INT,
1516 property_id INT,
1517 property_review_count INT,
1518 property_rating INT,
1519 review_id INT,
1520 review_title STRING,
1521 featured_type STRING
1522 )
1523 COMMENT 'featured reviews'"
1524
1525 hive_command_compress_limit_both "CREATE TEMPORARY FUNCTION HiveUdfRank AS 'com.tripadvisor.warehouse.hive.HiveUdfRank';
1526 INSERT OVERWRITE TABLE $TBL_FEATURED_HOTELS
1527 SELECT member_id, geo_id, property_id, property_review_count, property_rating, review_id, review_title, featured_type
1528 FROM (
1529 SELECT
1530 member_id,
1531 geo_id,
1532 property_id,
1533 property_review_count,
1534 property_rating,
1535 review_id,
1536 review_title,
1537 featured_type,
1538 HiveUdfRank(concat(member_id, geo_id)) as rank
1539 FROM (
1540 SELECT
1541 a.member_id,
1542 a.geo_id,
1543 a.locationid AS property_id,
1544 b.num_reviews AS property_review_count,
1545 c.score as property_rating,
1546 a.review_id,
1547 substr(a.review_title, 0, 100) AS review_title,
1548 a.featured_type,
1549 a.priority,
1550 a.score
1551 FROM $TBL_FEATURED_HOTELS_AGGREGATE a
1552 JOIN $TBL_ELIGIBLE_PROPERTY_REVIEW_COUNTS b on b.locationid = a.locationid
1553 JOIN t_locationratings c ON a.locationid = c.locationid AND c.questionid = 0 AND c.segmentid = 0 and c.score >= $MIN_RATING
1554 DISTRIBUTE BY member_id
1555 SORT BY
1556 member_id,
1557 geo_id,
1558 priority ASC,
1559 score DESC
1560 ) a
1561 ) b
1562 WHERE rank <= $MAX_RESULTS"
1563}
1564
1565function create_send_list
1566{
1567 # Filter out recipients who do not have $MIN_HOTEL_COUNT featured hotels
1568 local MIN_HOTEL_COUNT=3
1569
1570 hive_command "CREATE TABLE IF NOT EXISTS $TBL_SEND_LIST LIKE $TBL_PRELIMINARY_SEND_LIST"
1571
1572 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_SEND_LIST
1573 SELECT a.*
1574 FROM $TBL_PRELIMINARY_SEND_LIST a
1575 JOIN (
1576 SELECT
1577 member_id,
1578 geo_id,
1579 count(*) AS count
1580 FROM $TBL_FEATURED_HOTELS
1581 GROUP BY
1582 member_id,
1583 geo_id
1584 ) b
1585 ON a.member_id = b.member_id
1586 AND a.geo_id = b.geo_id
1587 AND b.count >= $MIN_HOTEL_COUNT"
1588}
1589
1590function create_friend_activity
1591{
1592 local NEW_CONTENT_LOOKBACK_DATE=$ONE_WEEK_AGO
1593 local MAX_RESULTS=2
1594
1595
1596 hive_command "CREATE TABLE IF NOT EXISTS $TBL_RECENT_ACTIVITY
1597 (
1598 member_id INT,
1599 geo_id INT,
1600 activity_location_id INT,
1601 activity_location_type_id INT,
1602 activity_type STRING,
1603 priority INT
1604 )
1605 COMMENT 'recent activity'"
1606
1607 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_RECENT_ACTIVITY
1608 SELECT
1609 member_id,
1610 geo_id,
1611 activity_location_id,
1612 activity_data[1] AS activity_location_type_id,
1613 if(activity_data[2] = 'Review', 'review', 'pin') AS activity_type,
1614 activity_data[0] AS priority
1615 FROM (
1616 SELECT
1617 a.member_id,
1618 b.parentid AS geo_id,
1619 a.activity['location_id'] AS activity_location_id,
1620 min(array(a.activity['priority'], a.activity['location_type'], a.activity['type'])) AS activity_data
1621 FROM a_recent_activity a
1622 JOIN t_locationpaths b
1623 ON a.activity['location_id'] = b.locationid
1624 AND b.pathtype = 1
1625 LEFT SEMI JOIN $TBL_ELIGIBLE_GEOS s ON b.parentid = s.geo_id
1626 WHERE (
1627 activity['type'] = 'Review'
1628 AND activity['review_published'] >= '$NEW_CONTENT_LOOKBACK_DATE'
1629 )
1630 OR (
1631 activity['type'] = 'Pin'
1632 AND cast(activity['pin_type'] AS INT) & 1 = 0 -- Been pins only
1633 AND activity['pin_created'] >= '$NEW_CONTENT_LOOKBACK_DATE'
1634 )
1635 GROUP BY a.member_id, a.activity['location_id'], b.parentid
1636 ) a"
1637
1638 hive_command "CREATE TABLE IF NOT EXISTS ${TMP_PREFIX}_friend_activity_all
1639 (
1640 member_id INT,
1641 geo_id INT,
1642 friend_id INT,
1643 friend_fb_id STRING,
1644 priority INT,
1645 friend_name STRING,
1646 friend_first_name STRING,
1647 friend_last_name STRING,
1648 friend_gender STRING,
1649 friend_locale STRING,
1650 activity_location_id INT,
1651 activity_location_type_id INT,
1652 activity_type STRING
1653 )
1654 COMMENT 'all friend activity'"
1655
1656 hive_command_compress_limit_both "INSERT OVERWRITE TABLE ${TMP_PREFIX}_friend_activity_all
1657 SELECT
1658 a.member_id,
1659 a.geo_id,
1660 c.friend_id,
1661 c.friend_fb_id,
1662 r.priority,
1663 substr(d.name, 0, 100) AS friend_name,
1664 substr(d.first_name, 0, 50) AS friend_first_name,
1665 substr(d.last_name, 0, 50) AS friend_last_name,
1666 d.gender AS friend_gender,
1667 d.locale AS friend_locale,
1668 r.activity_location_id,
1669 r.activity_location_type_id,
1670 r.activity_type
1671
1672 FROM $TBL_SEND_LIST a
1673
1674 JOIN t_external_member b
1675 ON a.member_id = b.memberid
1676 AND b.idtype = 'FB'
1677
1678 JOIN (
1679 SELECT
1680 cast(b.friend_fb_id AS STRING) AS fb_id,
1681 a.member_id AS friend_id,
1682 a.fb_id AS friend_fb_id
1683 FROM (
1684 SELECT
1685 a.member_id,
1686 b.fb_id,
1687 b.friends
1688 FROM (SELECT DISTINCT member_id FROM $TBL_RECENT_ACTIVITY) a
1689 JOIN a_ta_friendmap b
1690 ON a.member_id = b.member_id
1691 ) a
1692 LATERAL VIEW explode(a.friends) b AS friend_fb_id
1693 ) c ON b.externalid = c.fb_id
1694
1695 JOIN $TBL_RECENT_ACTIVITY r on r.member_id = c.friend_id AND r.geo_id = a.geo_id
1696
1697 JOIN fb_global_details d
1698 ON c.friend_fb_id = d.my_fbid"
1699
1700 hive_command "CREATE TABLE IF NOT EXISTS $TBL_FRIEND_ACTIVITY
1701 (
1702 member_id INT,
1703 geo_id INT,
1704 friend_id INT,
1705 friend_fb_id STRING,
1706 friend_name STRING,
1707 friend_first_name STRING,
1708 friend_last_name STRING,
1709 friend_gender STRING,
1710 friend_locale STRING,
1711 activity_location_id INT,
1712 activity_location_type_id INT,
1713 activity_type STRING
1714 )
1715 COMMENT 'friend activity'"
1716
1717 hive_command_compress_limit_both "set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
1718 CREATE TEMPORARY FUNCTION HiveUdfRank AS 'com.tripadvisor.warehouse.hive.HiveUdfRank';
1719 INSERT OVERWRITE TABLE $TBL_FRIEND_ACTIVITY
1720 SELECT member_id, geo_id, friend_id, friend_fb_id, friend_name, friend_first_name, friend_last_name, friend_gender,
1721 friend_locale, activity_location_id, activity_location_type_id, activity_type
1722 FROM (
1723 SELECT
1724 member_id,
1725 geo_id,
1726 friend_id,
1727 friend_fb_id,
1728 friend_name,
1729 friend_first_name,
1730 friend_last_name,
1731 friend_gender,
1732 friend_locale,
1733 activity_location_id,
1734 activity_location_type_id,
1735 activity_type,
1736 HiveUdfRank(concat(member_id, geo_id)) as rank
1737
1738 FROM (
1739
1740 SELECT
1741 member_id,
1742 geo_id,
1743 friend_id,
1744 friend_fb_id,
1745 priority,
1746 friend_name,
1747 friend_first_name,
1748 friend_last_name,
1749 friend_gender,
1750 friend_locale,
1751 activity_location_id,
1752 activity_location_type_id,
1753 activity_type
1754
1755 FROM ${TMP_PREFIX}_friend_activity_all
1756
1757 DISTRIBUTE BY
1758 member_id,
1759 geo_id
1760
1761 SORT BY
1762 member_id,
1763 geo_id,
1764 priority ASC
1765
1766 ) a
1767 ) b
1768 WHERE rank <= $MAX_RESULTS"
1769}
1770
1771function create_airfare
1772{
1773 hive_command "CREATE TABLE IF NOT EXISTS $TBL_AIRFARE
1774 (
1775 orig_airport_code STRING,
1776 dest_airport_code STRING,
1777 orig_location_id INT,
1778 dest_location_id INT,
1779 best_price STRING,
1780 currency STRING
1781 )
1782 COMMENT 'airfare'"
1783
1784 hive_command_compress_limit_both "INSERT OVERWRITE TABLE $TBL_AIRFARE
1785 SELECT
1786 a.orig_airport_code,
1787 a.dest_airport_code,
1788 a.orig_location_id,
1789 a.dest_location_id,
1790 min(b.current_fare / c.perdollar) AS price,
1791 'USD' AS currency
1792 FROM (
1793 SELECT DISTINCT
1794 a.home_airport_code AS orig_airport_code,
1795 a.dest_airport_code,
1796 b.locationid AS orig_location_id,
1797 c.locationid AS dest_location_id
1798 FROM $TBL_SEND_LIST a
1799 JOIN t_airportcodes b
1800 ON a.home_airport_code = b.airportcode
1801 JOIN t_airportcodes c
1802 ON a.dest_airport_code = c.airportcode
1803 ) a
1804 JOIN f_airwatch_raw_data b
1805 ON a.orig_airport_code = b.orig_airport
1806 AND a.dest_airport_code = b.dest_airport
1807 AND b.start_date >= '$TODAY'
1808 JOIN t_currency c
1809 ON b.fare_currency = c.code
1810 GROUP BY
1811 a.orig_airport_code,
1812 a.dest_airport_code,
1813 a.orig_location_id,
1814 a.dest_location_id"
1815}
1816
1817function create_special_offers
1818{
1819 local MAX_RESULTS=3;
1820
1821 hive_command "CREATE TABLE IF NOT EXISTS $TBL_SPECIAL_OFFERS
1822 (
1823 member_id INT,
1824 geo_id INT,
1825 property_id INT,
1826 property_type_id INT,
1827 coupon_id INT,
1828 coupon_teaser STRING
1829 )
1830 COMMENT 'special offers by eligible geo'"
1831
1832 hive_command_compress_limit_both "CREATE TEMPORARY FUNCTION HiveUdfRank AS 'com.tripadvisor.warehouse.hive.HiveUdfRank';
1833 INSERT OVERWRITE TABLE $TBL_SPECIAL_OFFERS
1834 SELECT member_id, geo_id, property_id, property_type_id, coupon_id, coupon_teaser
1835 FROM (
1836 SELECT
1837 member_id,
1838 geo_id,
1839 property_id,
1840 property_type_id,
1841 coupon_id,
1842 coupon_teaser,
1843 HiveUdfRank(concat(member_id, geo_id, property_id)) as property_rank,
1844 HiveUdfRank(concat(member_id, geo_id)) as geo_rank
1845 FROM (
1846
1847 SELECT
1848 c.member_id,
1849 c.geo_id,
1850 b.property_id,
1851 b.property_type_id,
1852 a.coupon_id,
1853 a.coupon_teaser
1854
1855 FROM a_geo_coupons a
1856
1857 -- Retain coupons for eligible properties
1858 JOIN $TBL_ELIGIBLE_PROPERTIES b
1859 ON a.geo_id = b.geo_id AND a.hotel_id = b.property_id
1860
1861 -- Join coupons with recipients
1862 JOIN $TBL_SEND_LIST c
1863 ON b.geo_id = c.geo_id
1864 AND a.lang = c.lang
1865
1866 -- Discard any properties already used by 'featured hotels'
1867 LEFT OUTER JOIN $TBL_FEATURED_HOTELS d
1868 ON c.member_id = d.member_id
1869 AND c.geo_id = d.geo_id
1870 AND b.property_id = d.property_id
1871
1872 WHERE d.member_id IS NULL
1873
1874 DISTRIBUTE BY
1875 member_id,
1876 geo_id,
1877 property_id
1878
1879 SORT BY
1880 member_id,
1881 geo_id,
1882 property_id,
1883 coupon_id ASC
1884 ) a
1885 ) b
1886 WHERE property_rank = 1
1887 AND geo_rank <= $MAX_RESULTS"
1888}
1889
1890function create_featured_rentals
1891{
1892 local HIVE_OUTPUT_DIR="/tmp/hive_output"
1893 local RECENT_LOCATIONS_PREFIX="$HIVE_OUTPUT_DIR/${TMP_PREFIX}_recent_locations"
1894 local AIRPORT_LOCATIONS_PREFIX="$HIVE_OUTPUT_DIR/${TMP_PREFIX}_airport_locations"
1895 local RAW_DATA_FILENAME='/tmp/TripWatchRentalGenerator.txt'
1896 local IMPORT_DATA_FILENAME='/tmp/TripWatchRentalGenerator'
1897 local MAX_RESULTS=2
1898
1899 # Export recent locations from hive
1900 hive_command "set hive.exec.compress.output=false;
1901 INSERT OVERWRITE LOCAL DIRECTORY '$RECENT_LOCATIONS_PREFIX'
1902 SELECT geo_id, geo_name
1903 FROM $TBL_ELIGIBLE_GEOS"
1904
1905 # Generate csv file from recent locations
1906 javatr.sh \
1907 com.TripResearch.newsletter.syncmanager.HadoopDirToTsvFile \
1908 -hdpfolder $RECENT_LOCATIONS_PREFIX \
1909 -columns "geo_id,geo_name"
1910
1911 # Create dummy airport locations csv
1912 echo 'geo_id,geo_name' > $AIRPORT_LOCATIONS_PREFIX.csv
1913
1914
1915 # Generate VR data
1916 javatr.sh \
1917 -XX:+UseConcMarkSweepGC \
1918 -Xmx8g \
1919 -Xmx16g \
1920 com.TripResearch.newsletter.application.TripWatchRentalGenerator \
1921 -hive \
1922 -recent_locations $RECENT_LOCATIONS_PREFIX \
1923 -airport_locations $AIRPORT_LOCATIONS_PREFIX
1924
1925
1926 # Skip first record
1927 awk 'NR > 1' $RAW_DATA_FILENAME > $IMPORT_DATA_FILENAME
1928
1929
1930 # Import data into hive
1931 hive_command "CREATE TABLE IF NOT EXISTS $TBL_RENTALS
1932 (
1933 geo_id INT,
1934 property_id INT,
1935 property_name STRING,
1936 property_details STRING,
1937 property_thumbnail STRING,
1938 property_rating INT,
1939 property_order_type STRING,
1940 property_order INT,
1941 locale STRING
1942 )
1943 COMMENT 'rentals'
1944 ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
1945 STORED AS TEXTFILE"
1946
1947 hive_command "LOAD DATA
1948 LOCAL INPATH '$IMPORT_DATA_FILENAME'
1949 OVERWRITE INTO TABLE $TBL_RENTALS"
1950
1951
1952 # Generate featured rentals DEXT
1953 hive_command "CREATE TABLE IF NOT EXISTS $TBL_FEATURED_RENTALS
1954 (
1955 member_id INT,
1956 geo_id INT,
1957 property_id INT,
1958 property_name STRING,
1959 property_details STRING,
1960 property_thumbnail STRING
1961 )
1962 COMMENT 'featured rentals'"
1963
1964 hive_command_compress_limit_both "CREATE TEMPORARY FUNCTION HiveUdfRank AS 'com.tripadvisor.warehouse.hive.HiveUdfRank';
1965 INSERT OVERWRITE TABLE $TBL_FEATURED_RENTALS
1966 SELECT member_id, geo_id, property_id, property_name, property_details, property_thumbnail
1967 FROM (
1968 SELECT
1969 member_id,
1970 geo_id,
1971 property_id,
1972 substr(property_name, 0, 100) AS property_name,
1973 property_details,
1974 property_thumbnail,
1975 HiveUdfRank(concat(member_id, geo_id)) as rank
1976 FROM (
1977 SELECT
1978 a.member_id,
1979 a.geo_id,
1980 b.property_id,
1981 b.property_name,
1982 b.property_details,
1983 b.property_thumbnail,
1984 b.property_order
1985 FROM $TBL_SEND_LIST a
1986 JOIN $TBL_RENTALS b
1987 ON a.geo_id = b.geo_id
1988 AND a.locale = b.locale
1989 AND b.property_thumbnail > ''
1990 DISTRIBUTE BY
1991 member_id,
1992 geo_id
1993 SORT BY
1994 member_id,
1995 geo_id,
1996 property_order ASC
1997 ) a
1998 ) b
1999 WHERE rank <= $MAX_RESULTS"
2000
2001}
2002
2003function create_test_send_list
2004{
2005 # Create a test send list representing each display scenario
2006
2007 local MAX_FRIENDS=2
2008 local MAX_RENTALS=2
2009
2010 hive_command "CREATE TABLE IF NOT EXISTS $TBL_TEST_SEND_LIST LIKE $TBL_SEND_LIST"
2011
2012 hive_command_compress_limit_both "CREATE TEMPORARY FUNCTION HiveUdfRank AS 'com.tripadvisor.warehouse.hive.HiveUdfRank';
2013 INSERT OVERWRITE TABLE $TBL_TEST_SEND_LIST
2014 SELECT b.*
2015 FROM (
2016
2017 SELECT DISTINCT
2018 member_id,
2019 geo_id
2020 FROM (
2021
2022 -- Get one recipient for each creative type
2023 SELECT
2024 member_id,
2025 geo_id,
2026 HiveUdfRank(creative_type) as rank
2027 FROM (
2028 SELECT
2029 member_id,
2030 geo_id,
2031 creative_type
2032 FROM $TBL_SEND_LIST
2033 CLUSTER BY creative_type
2034 ) a
2035
2036 UNION ALL
2037
2038 -- Get one recipient for each value from 0 to $MAX_FRIENDS friends
2039 SELECT
2040 member_id,
2041 geo_id,
2042 HiveUdfRank(count) as rank
2043 FROM (
2044 SELECT
2045 a.member_id,
2046 a.geo_id,
2047 sum(if(b.member_id IS NULL, 0, 1)) AS count
2048 FROM $TBL_SEND_LIST a
2049 LEFT OUTER JOIN $TBL_FRIEND_ACTIVITY b
2050 ON a.member_id = b.member_id
2051 AND a.geo_id = b.geo_id
2052 GROUP BY
2053 a.member_id,
2054 a.geo_id
2055 CLUSTER BY count
2056 ) a
2057 WHERE count <= $MAX_FRIENDS
2058
2059 UNION ALL
2060
2061 -- Get one recipient with neither airfare nor offers, just airfare, and just offers
2062 SELECT
2063 member_id,
2064 geo_id,
2065 HiveUdfRank(type) as rank
2066 FROM (
2067 SELECT
2068 a.member_id,
2069 a.geo_id,
2070 CASE
2071 WHEN b.orig_airport_code IS NULL AND c.member_id IS NULL THEN 0
2072 WHEN b.orig_airport_code IS NOT NULL AND c.member_id IS NULL then 1
2073 WHEN b.orig_airport_code IS NULL AND c.member_id IS NOT NULL then 2
2074 ELSE 3
2075 END AS type
2076 FROM $TBL_SEND_LIST a
2077 LEFT OUTER JOIN $TBL_AIRFARE b
2078 ON a.home_airport_code = b.orig_airport_code
2079 AND a.dest_airport_code = b.dest_airport_code
2080 LEFT OUTER JOIN $TBL_SPECIAL_OFFERS c
2081 ON a.member_id = c.member_id
2082 AND a.geo_id = c.geo_id
2083 CLUSTER BY type
2084 ) a
2085
2086 UNION ALL
2087
2088 -- Get one recipient for each value from 0 to $MAX_RENTALS rentals
2089 SELECT
2090 member_id,
2091 geo_id,
2092 HiveUdfRank(count) as rank
2093 FROM (
2094 SELECT
2095 a.member_id,
2096 a.geo_id,
2097 sum(if(b.member_id IS NULL, 0, 1)) AS count
2098 FROM $TBL_SEND_LIST a
2099 LEFT OUTER JOIN $TBL_FEATURED_RENTALS b
2100 ON a.member_id = b.member_id
2101 AND a.geo_id = b.geo_id
2102 GROUP BY
2103 a.member_id,
2104 a.geo_id
2105 CLUSTER BY count
2106 ) a
2107 WHERE count <= $MAX_RENTALS
2108
2109 UNION ALL
2110 SELECT member_id, geo_id,
2111 HiveUdfRank(concat(featured_type, cast(featured_count AS string))) as rank
2112 FROM (
2113 SELECT a.member_id, a.geo_id, featured_type, count(*) as featured_count
2114 FROM $TBL_SEND_LIST a
2115 JOIN $TBL_FEATURED_HOTELS b on a.member_id = b.member_id AND a.geo_id = b.geo_id
2116 GROUP BY a.member_id, a.geo_id, featured_type
2117 CLUSTER BY featured_type, featured_count
2118 ) a
2119
2120 UNION ALL
2121 SELECT member_id, geo_id, HiveUdfRank(top_list_type) as rank
2122 FROM (
2123 SELECT a.member_id, a.geo_id, b.top_list_type
2124 FROM $TBL_SEND_LIST a
2125 JOIN $TBL_TOP_LISTS b ON a.geo_id = b.geo_id AND a.lang = b.property_last_review_lang
2126 CLUSTER BY b.top_list_type
2127 ) a
2128 ) a
2129 WHERE rank = 1
2130 ) a
2131 JOIN $TBL_SEND_LIST b
2132 ON a.member_id = b.member_id
2133 AND a.geo_id = b.geo_id"
2134}
2135
2136function export_from_hive
2137{
2138 echo "Exporting from hive"
2139 dump_data_to_local_dir_with_ts "/tmp/hive_output/DBV_tripwatch_geo_details" "$TBL_GEO_DETAILS"
2140 dump_data_to_local_dir_with_ts "/tmp/hive_output/DBV_tripwatch_top_lists" "$TBL_TOP_LISTS"
2141 dump_data_to_local_dir_with_ts "/tmp/hive_output/DBV_tripwatch_trending" "$TBL_TRENDING"
2142 dump_data_to_local_dir_with_ts "/tmp/hive_output/send_tripwatch_all" "$TBL_SEND_LIST" "member_id,geo_id,userid,username,email,locale,lang,home_geo_id,home_airport_code,dest_airport_code,creative_type"
2143 dump_data_to_local_dir_with_ts "/tmp/hive_output/DBV_tripwatch_featured_hotels" "$TBL_FEATURED_HOTELS"
2144 dump_data_to_local_dir_with_ts "/tmp/hive_output/DBV_tripwatch_friend_activity" "$TBL_FRIEND_ACTIVITY"
2145 dump_data_to_local_dir_with_ts "/tmp/hive_output/DBV_tripwatch_airfare" "$TBL_AIRFARE"
2146 dump_data_to_local_dir_with_ts "/tmp/hive_output/DBV_tripwatch_special_offers" "$TBL_SPECIAL_OFFERS"
2147 dump_data_to_local_dir_with_ts "/tmp/hive_output/DBV_tripwatch_featured_rentals" "$TBL_FEATURED_RENTALS"
2148}
2149
2150function record_trending_history
2151{
2152 # Record trending properties to avoid repeating properties in future sends
2153 hive_command_compress_limit_both "INSERT OVERWRITE TABLE a_trending_history PARTITION (ds = '$SEND_DATE')
2154 SELECT DISTINCT geo_id, property_id
2155 FROM $TBL_TRENDING"
2156}
2157
2158function load_to_dexts
2159{
2160 load_hive_table_to_dext \
2161 $TBL_GEO_DETAILS \
2162 "/tmp/hive_output/DBV_tripwatch_geo_details" \
2163 "geo_id,send_date,new_view_count,new_forum_topic_count,new_review_count,new_hotel_review_count,new_restaurant_review_count,new_attraction_review_count,hotel_count,restaurant_count,attraction_count,special_offer_count,rental_count,temp_f_high,temp_f_low,temp_c_high,temp_c_low" \
2164 "DBV_tripwatch_geo_details"
2165
2166 load_hive_table_to_dext \
2167 $TBL_TOP_LISTS \
2168 "/tmp/hive_output/DBV_tripwatch_top_lists" \
2169 "geo_id,property_id,property_type_id,property_review_count,property_last_review_id,property_last_review_lang,property_last_review_title,top_list_type,popularity,rank" \
2170 "DBV_tripwatch_top_lists"
2171
2172 load_hive_table_to_dext \
2173 $TBL_TRENDING \
2174 "/tmp/hive_output/DBV_tripwatch_trending" \
2175 "geo_id,property_id,property_type_id,property_rating" \
2176 "DBV_tripwatch_trending"
2177
2178 load_hive_table_to_dext \
2179 $TBL_SEND_LIST \
2180 "/tmp/hive_output/send_tripwatch_all" \
2181 "member_id,geo_id,userid,username,email,locale,lang,home_geo_id,home_airport_code,dest_airport_code,creative_type,email_guid,pool_num,pool" \
2182 "send_tripwatch_all"
2183
2184 load_hive_table_to_dext \
2185 $TBL_FEATURED_HOTELS \
2186 "/tmp/hive_output/DBV_tripwatch_featured_hotels" \
2187 "member_id,geo_id,property_id,property_review_count,property_rating,review_id,review_title,featured_type" \
2188 "DBV_tripwatch_featured_hotels"
2189
2190 load_hive_table_to_dext \
2191 $TBL_FRIEND_ACTIVITY \
2192 "/tmp/hive_output/DBV_tripwatch_friend_activity" \
2193 "member_id,geo_id,friend_id,friend_fb_id,friend_name,friend_first_name,friend_last_name,friend_gender,friend_locale,activity_location_id,activity_location_type_id,activity_type" \
2194 "DBV_tripwatch_friend_activity"
2195
2196 load_hive_table_to_dext \
2197 $TBL_AIRFARE \
2198 "/tmp/hive_output/DBV_tripwatch_airfare" \
2199 "orig_airport_code,dest_airport_code,orig_location_id,dest_location_id,best_price,currency" \
2200 "DBV_tripwatch_airfare"
2201
2202 load_hive_table_to_dext \
2203 $TBL_SPECIAL_OFFERS \
2204 "/tmp/hive_output/DBV_tripwatch_special_offers" \
2205 "member_id,geo_id,property_id,property_type_id,coupon_id,coupon_teaser" \
2206 "DBV_tripwatch_special_offers"
2207
2208 load_hive_table_to_dext \
2209 $TBL_FEATURED_RENTALS \
2210 "/tmp/hive_output/DBV_tripwatch_featured_rentals" \
2211 "member_id,geo_id,property_id,property_name,property_details,property_thumbnail" \
2212 "DBV_tripwatch_featured_rentals"
2213
2214 load_hive_table_to_dext \
2215 $TBL_TEST_SEND_LIST \
2216 "/tmp/hive_output/send_tripwatch_test" \
2217 "member_id,geo_id,userid,username,email,locale,lang,home_geo_id,home_airport_code,dest_airport_code,creative_type,email_guid,pool_num,pool" \
2218 "send_tripwatch_test"
2219}
2220
2221function main
2222{
2223 echo "Generating data for TripWatch campaign"
2224 verify_dependencies
2225 create_working_database
2226
2227 # Per-geo content
2228 create_watched_locations
2229 create_eligible_geos
2230 create_eligible_properties
2231 create_eligible_property_review_counts
2232 create_eligible_property_reviews
2233 create_unique_property_views
2234 load_weather_data
2235 create_geo_details
2236 create_top_lists
2237 create_trending
2238
2239 # Per-member content
2240 create_preliminary_send_list
2241 create_featured_hotels
2242 create_send_list
2243 create_friend_activity
2244 create_airfare
2245 create_special_offers
2246 create_featured_rentals
2247 create_test_send_list
2248
2249 export_from_hive
2250
2251 # For live sends, record trending properties
2252 if [ "$LIVE" = 1 ]; then
2253 record_trending_history
2254 fi
2255
2256 # For testing purposes only
2257 # load_to_dexts
2258}
2259
2260main