(1)建 student、studentrc、studentlzo 表:(hive 托管)
    
    -- Managed (internal) Hive table in the default TEXTFILE format:
    -- partitioned by stat_date, bucketed into 4 buckets on id, and
    -- sorted by age inside each bucket.
    CREATE TABLE student (
        id   INT,
        age  INT,
        name STRING
    )
    PARTITIONED BY (stat_date STRING)
    CLUSTERED BY (id) SORTED BY (age) INTO 4 BUCKETS
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
    
    
    -- Same layout as student, but stored as RCFile (columnar) instead of
    -- the default text format.
    CREATE TABLE studentrc (
        id   INT,
        age  INT,
        name STRING
    )
    PARTITIONED BY (stat_date STRING)
    CLUSTERED BY (id) SORTED BY (age) INTO 4 BUCKETS
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    STORED AS RCFILE;
    
    
    -- Also an RCFile table; the "lzo" in the name refers to the LZO codec
    -- applied via session settings at insert time, not to the file format.
    CREATE TABLE studentlzo (
        id   INT,
        age  INT,
        name STRING
    )
    PARTITIONED BY (stat_date STRING)
    CLUSTERED BY (id) SORTED BY (age) INTO 4 BUCKETS
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    STORED AS RCFILE;
  
    文件格式 textfile, sequencefile, rcfile
    
    (2)设置环境变量:
    
    -- Make INSERTs honor the table's CLUSTERED BY ... INTO n BUCKETS spec
    -- (one reducer per bucket).
    -- NOTE(review): this property was removed in Hive 2.x, where bucketing
    -- is always enforced — confirm the target Hive version.
    set hive.enforce.bucketing = true; 
    
    (3)插入数据:
    
    -- Load the local file into the 20120802 partition of student,
    -- replacing any existing data in that partition (OVERWRITE).
    LOAD DATA LOCAL INPATH '/home/hadoop/hivetest1.txt'
    OVERWRITE INTO TABLE student
    PARTITION (stat_date="20120802");
    
    
    
    (CPU使用率很高)
    
    -- Rewrite the 20120802 partition of student1 from student, ordering
    -- each reducer's output by age (SORT BY = per-reducer order only).
    -- NOTE(review): student1 is not created in the DDL visible in this
    -- file (only student/studentrc/studentlzo are) — confirm it exists.
    FROM student
    INSERT OVERWRITE TABLE student1 PARTITION (stat_date="20120802")
    SELECT id, age, name
    WHERE stat_date = "20120802"
    SORT BY age;
    
    
    查看数据
    
    -- DISTRIBUTE BY acts like the MapReduce partition key: rows with the
    -- same id are routed to the same reducer (no ordering is implied).
    -- Fixed: the original used "//" for the trailing comment, which is not
    -- a valid HiveQL comment delimiter ("--" is) and would fail to parse.
    select id, age, name from student distribute by id;
    
    
    
    抽选数据(一般测试的情况下使用)
    
    -- Sample by bucket: with 4 buckets and BUCKET 1 OUT OF 2, roughly
    -- half of the buckets are read (intended for quick testing).
    SELECT *
    FROM student
    TABLESAMPLE (BUCKET 1 OUT OF 2 ON id);
    
    TABLESAMPLE(BUCKET x OUT OF y)
    
    其中, x必须不大于y, y必须是在创建表的时候bucket的数量的因子或者倍数, hive会根据y的大小来决定抽样多少. 比如原本分了32份, 当y=16时, 抽取32/16=2份, 这时TABLESAMPLE(BUCKET 3 OUT OF 16) 就意味着要抽取第3和第16+3=19份的样本. 如果y=64, 则要抽取 32/64=1/2份数据, 这时TABLESAMPLE(BUCKET 3 OUT OF 64) 意味着抽取第3份数据的一半来进行.
    
    
    rcfile操作
    
    
    -- 导入(gzip压缩)
    
    -- Enforce bucketed insert and gzip-compress the final job output.
    set hive.enforce.bucketing=true; 
    
    set hive.exec.compress.output=true;  
    
    -- NOTE(review): mapred.output.compress* are legacy (MRv1) property
    -- names; newer Hadoop uses mapreduce.output.fileoutputformat.compress*.
    -- They still work through Hadoop's deprecation mapping — confirm the
    -- cluster version before relying on them.
    set mapred.output.compress=true;  
    
    set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;  
    
    set io.compression.codecs=org.apache.hadoop.io.compress.GzipCodec;  
    
    -- Rewrite the 20120802 partition of studentrc (RCFile) from student,
    -- sorting per reducer by age; output files are gzip-compressed.
    from student
    
    insert overwrite table studentrc partition(stat_date="20120802")  
    
    select id,age,name where stat_date="20120802" sort by age;
    
    
    
    -- lzo压缩
    
    -- RCFile record buffer: 16 MB (16 * 1024 * 1024).
    -- Fixed: the original trailing comments used "//", which is not a
    -- valid HiveQL comment delimiter ("--" is) and would fail to parse.
    set hive.io.rcfile.record.buffer.size = 16777216;
    
    -- I/O buffer size: 128 KB (128 * 1024).
    set io.file.buffer.size = 131072;
    
    
    -- Enforce bucketed insert and compress the job output with LZO.
    set hive.enforce.bucketing=true; 
    
    set hive.exec.compress.output=true;  
    
    -- NOTE(review): mapred.output.compress* are legacy (MRv1) property
    -- names; newer Hadoop uses mapreduce.output.fileoutputformat.compress*
    -- — confirm the cluster version. LzoCodec also requires the
    -- hadoop-lzo native libraries to be installed on the cluster.
    set mapred.output.compress=true;  
    
    set mapred.output.compression.codec=com.hadoop.compression.lzo.LzoCodec;  
    
    set io.compression.codecs=com.hadoop.compression.lzo.LzoCodec;  
    
    -- Rewrite the 20120802 partition of studentlzo (RCFile) from student,
    -- sorting per reducer by age; output files are LZO-compressed.
    from student
    
    insert overwrite table studentlzo partition(stat_date="20120802")  
    
    select id,age,name where stat_date="20120802" sort by age;
    
    
    -- sequencefile导入
    
    -- Gzip-compressed SequenceFile export.
    set hive.exec.compress.output=true;  
    
    -- NOTE(review): legacy mapred.* property names — see note above the
    -- gzip import block; confirm against the cluster's Hadoop version.
    set mapred.output.compress=true;  
    
    set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;  
    
    set io.compression.codecs=org.apache.hadoop.io.compress.GzipCodec;  
    
    -- NOTE(review): studentseq is not created anywhere in this file —
    -- presumably a STORED AS SEQUENCEFILE table; confirm its DDL exists.
    -- SELECT * on a partitioned table also emits stat_date as the last
    -- column, so studentseq's schema must account for it.
    insert overwrite table studentseq select * from student;
  


 
					 
					