Hive Partitioning and Bucketing Example on Twitter Data
Overview of Hive Partitioning :
Hive organizes tables into partitions. Partitioning is a way of dividing a table into related parts based on the values of the partition columns, such as date, city, or department. With partitions, it is easy to query only the relevant portion of the data.
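For example, once a partitioned table exists, a query that filters on a partition column only reads the matching partition directories instead of scanning the whole table. A minimal sketch, assuming the partitiontwitter table and the Chicago partition created later in this article:

select tweetId, username, txt
from partitiontwitter
where profileLocation = 'Chicago';   -- only the Chicago partition directory is read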
Overview of Hive Bucketing :
A Hive partition can be further subdivided into clusters, or buckets. Bucketing is another technique for decomposing the data into more manageable, roughly equal-sized parts, based on the hash of a chosen column.
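Because each partition is split into a fixed number of files hashed on the clustering column, Hive can also sample a single bucket instead of reading everything. A minimal sketch, assuming the buckettwitter table created later in this article:

select username, retcount
from buckettwitter tablesample(bucket 1 out of 2 on tweetId)   -- reads roughly half the rows
where profileLocation = 'Chicago';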
The dataset :
Tweet ID
Username
Text
Created Date
Profile Location
Favc
Retweet
Retweet Count
Count of Followers
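Each record in Twitterdata.txt is then a single tab-separated line containing these nine fields in order; the values below are purely hypothetical, shown only to illustrate the layout:

466123402344560000	john_doe	Loving the weather today	26 04:50:56 UTC 2014	Chicago	12	false	3	540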
Script :
Create table with Twitter data :
create table twitter04(tweetId BIGINT, username STRING, txt STRING, CreatedAt STRING,
profileLocation STRING, favc BIGINT, retweet STRING, retcount BIGINT, followerscount BIGINT)
row format delimited
fields terminated by '\t'
stored as textfile;
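To confirm the table was created with the expected columns, the schema can be inspected (this check is not part of the original script):

describe twitter04;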
Load data from the input file (Twitterdata.txt) into the twitter04 table :
Load data local inpath '/root/manish/Twitterdata.txt' overwrite into table twitter04;
Note : if you are loading the file from HDFS rather than the local filesystem, omit the LOCAL keyword.
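A quick sanity check that the load parsed the tab-delimited file as expected:

select count(*) from twitter04;
select * from twitter04 limit 5;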
Create table with partitioning :
Create table partitiontwitter(tweetId BIGINT, username STRING, txt STRING,
favc BIGINT, retweet STRING, retcount BIGINT, followerscount BIGINT)
partitioned by(CreatedAt String,profileLocation STRING)
row format delimited
fields terminated by '\t'
stored as textfile;
Load data from the twitter04 table into the partitioned table :
insert overwrite table partitiontwitter
partition (CreatedAt="26 04:50:56 UTC 2014",profileLocation="Chicago")
select tweetId,username,txt,favc,retweet,retcount,followerscount
from twitter04 where profileLocation='Chicago' limit 50;
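To verify that the static partition was created, list the table's partitions. As an alternative to repeating this insert once per date and city, a dynamic-partition insert lets Hive derive the partition values from the data itself; the statements below are a sketch of that approach, not part of the original script:

show partitions partitiontwitter;

set hive.exec.dynamic.partition = true;
set hive.exec.dynamic.partition.mode = nonstrict;
insert overwrite table partitiontwitter
partition (CreatedAt, profileLocation)
select tweetId, username, txt, favc, retweet, retcount, followerscount, CreatedAt, profileLocation
from twitter04;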
Create table with bucketing :
create table buckettwitter(tweetId BIGINT, username STRING, txt STRING, CreatedAt STRING,
favc BIGINT, retweet STRING, retcount BIGINT, followerscount BIGINT)
partitioned by(profileLocation STRING)
clustered by(tweetId) into 2 buckets
row format delimited
fields terminated by '\t'
stored as textfile;
set hive.enforce.bucketing = true;
Load data from the twitter04 table into the bucketed table :
insert overwrite table buckettwitter partition(profileLocation="Chicago")
select tweetId, username, txt, CreatedAt, favc, retweet, retcount, followerscount
from twitter04
where profileLocation = 'Chicago' limit 100;
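After the insert, the Chicago partition should contain two bucket files, one per hash bucket of tweetId. The checks below are a sketch; the warehouse path assumes Hive's default location and may differ in your setup:

dfs -ls /user/hive/warehouse/buckettwitter/profilelocation=Chicago;

select count(*) from buckettwitter tablesample(bucket 1 out of 2 on tweetId)
where profileLocation = 'Chicago';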