[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-13 Thread stanlyxiang
Github user stanlyxiang commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r209506292
  
--- Diff: contrib/exthdfs/exthdfs.c ---
@@ -0,0 +1,469 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "postgres.h"
+
+#include "common.h"
+#include "access/extprotocol.h"
+#include "cdb/cdbdatalocality.h"
+#include "storage/fd.h"
+#include "storage/filesystem.h"
+#include "utils/uri.h"
+
+
+
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(hdfsprotocol_blocklocation);
+PG_FUNCTION_INFO_V1(hdfsprotocol_validate);
+
+Datum hdfsprotocol_blocklocation(PG_FUNCTION_ARGS);
+Datum hdfsprotocol_validate(PG_FUNCTION_ARGS);
+
+Datum hdfsprotocol_blocklocation(PG_FUNCTION_ARGS)
+{
+
+   // Build the result instance
+   int nsize = 0;
+   int numOfBlock = 0;
+   ExtProtocolBlockLocationData *bldata =
+   palloc0(sizeof(ExtProtocolBlockLocationData));
+   if (bldata == NULL)
+   {
+   elog(ERROR, "hdfsprotocol_blocklocation : "
+"cannot allocate due to no memory");
+   }
+   bldata->type = T_ExtProtocolBlockLocationData;
+   fcinfo->resultinfo = bldata;
+
+   ExtProtocolValidatorData *pvalidator_data = (ExtProtocolValidatorData *)
+   
(fcinfo->context);
+
+
+// Parse URI of the first location, we expect all locations uses the 
same
+// name node server. This is checked in validation function.
+
+   char *first_uri_str = (char 
*)strVal(lfirst(list_head(pvalidator_data->url_list)));
+   Uri *uri = ParseExternalTableUri(first_uri_str);
+
+   elog(DEBUG3, "hdfsprotocol_blocklocation : "
+"extracted HDFS name node address %s:%d",
+uri->hostname, uri->port);
+
+   // Create file system instance
+   hdfsFS fs = hdfsConnect(uri->hostname, uri->port);
+   if (fs == NULL)
+   {
+   elog(ERROR, "hdfsprotocol_blocklocation : "
+   "failed to create HDFS instance to 
connect to %s:%d",
+   uri->hostname, uri->port);
+   }
+
+   // Clean up uri instance as we don't need it any longer
+   pfree(uri);
--- End diff --

please use "FreeExternalTableUri" to all Uri


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208448574
  
--- Diff: src/backend/commands/analyze.c ---
@@ -3266,3 +3298,380 @@ static void 
gp_statistics_estimate_reltuples_relpages_parquet(Relation rel, floa
pfree(fstotal);
return;
 }
+
+/**
+ * This method estimates the number of tuples and pages in an extern 
relation. We can not get accurate tuple counts
+ * and pages counts in the catalog. Therefore, we have to get reltuples 
and relpages manually.
+ *
+ * Input:
+ * rel - Relation. Must be an external table.
+ *
+ * Output:
+ * reltuples - exact number of tuples in relation.
+ * relpages  - exact number of pages.
+ */
+static void gp_statistics_estimate_reltuples_relpages_external(Relation 
rel, float4 *relTuples, float4 *relPages){
+   Oid extRelationOid = RelationGetRelid(rel);
+   getExternalRelTuples(extRelationOid, relTuples);
+   getExternalRelPages(extRelationOid, relPages, rel);
+}
+
+/**
+ * This method called by analyzeExternalEstimateReltuplesRelpages,
+ * to get External Relation reltuple counts, we run count(*) sql manually
+ *
+ * Input:
+ * extRelationOid - External Table Relation Oid
+ * Output:
+ * relTuples - exact number of tuples in relation.
+ */
+static void getExternalRelTuples(Oid extRelationOid, float4 *relTuples){
+   const char *schemaName = NULL;
+   const char *tableName = NULL;
+   schemaName = get_namespace_name(get_rel_namespace(extRelationOid)); /* 
must be pfreed */
+   tableName = get_rel_name(extRelationOid); /* must be pfreed */
+
+   StringInfoData str;
+   initStringInfo();
+   appendStringInfo(, "select count(*)::float4 from %s.%s as Ta",
+   quote_identifier(schemaName),
+   quote_identifier(tableName));
+
+   spiExecuteWithCallback(str.data, false /*readonly*/, 0 /*tcount */,
+   
spiCallback_getSingleResultRowColumnAsFloat4, relTuples);
+   pfree((void *) tableName);
+   pfree((void *) schemaName);
+   pfree(str.data);
+}
+
+/**
+ * This method called by analyzeExternalEstimateReltuplesRelpages,to get 
External Relation relpages counts.
+ * We call GetExtTableEntry method to get get List of external Table 
Locations.And then we go through every
+ * location url to sum the count of relpages.External Relation now support 
some different protocals, therefore
+ * we need to process them in different way.
+ *
+ * Input:
+ * extRelationOid - External Table Relation Oid
+ * Output:
+ * relTuples - exact number of pages in relation.
+ */
+static void getExternalRelPages(Oid extRelationOid, float4 *relPages , 
Relation rel){
+
+   ExtTableEntry* entry = GetExtTableEntry(extRelationOid);
+   List* extLocations = entry->locations;
+   int num_urls = list_length(extLocations);
+   ListCell *cell = list_head(extLocations);
+   ListCell *cellTmp = NULL;
+   while(cell != NULL)
+   {
+   char *url = pstrdup(((Value*)lfirst(cell))->val.str);
+   Assert(url != NULL);
+   Uri *uri = ParseExternalTableUri(url);
+   Assert(uri != NULL);
+   switch (uri->protocol){
+   case URI_HDFS:
+   *relPages += getExtrelPagesHDFS(uri);
+   break;
+
+   /*
+* to be done
+*/
+   case URI_GPFDIST:
+*relPages = 1.0;
+elog(NOTICE,"In external table ANALYZE command are not 
supported in GPFDIST location so far.");
+break;
+case URI_FILE:
+*relPages = 1.0;
+elog(NOTICE,"In external table ANALYZE command are not 
supported in FILE location so far.");
+break;
+   case URI_FTP:
+   *relPages = 1.0;
+   elog(NOTICE,"In external table ANALYZE command 
are not supported in FTP location so far.");
+break;
+   case URI_HTTP:
+   *relPages = 1.0;
+   elog(NOTICE,"In external table ANALYZE command 
are not supported in HTTP location so far.");
+break;
+   case URI_CUSTOM:
+   *relPages = 1.0;
+   elog(NOTICE,"In external table ANALYZE command 
are not supported in CUSTOM location so far.");
+break;
+   case URI_GPFDISTS:
+   

[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208439468
  
--- Diff: src/backend/commands/analyze.c ---
@@ -3266,3 +3298,380 @@ static void 
gp_statistics_estimate_reltuples_relpages_parquet(Relation rel, floa
pfree(fstotal);
return;
 }
+
+/**
+ * This method estimates the number of tuples and pages in an extern 
relation. We can not get accurate tuple counts
+ * and pages counts in the catalog. Therefore, we have to get reltuples 
and relpages manually.
+ *
+ * Input:
+ * rel - Relation. Must be an external table.
+ *
+ * Output:
+ * reltuples - exact number of tuples in relation.
+ * relpages  - exact number of pages.
+ */
+static void gp_statistics_estimate_reltuples_relpages_external(Relation 
rel, float4 *relTuples, float4 *relPages){
+   Oid extRelationOid = RelationGetRelid(rel);
+   getExternalRelTuples(extRelationOid, relTuples);
+   getExternalRelPages(extRelationOid, relPages, rel);
+}
+
+/**
+ * This method called by analyzeExternalEstimateReltuplesRelpages,
+ * to get External Relation reltuple counts, we run count(*) sql manually
+ *
+ * Input:
+ * extRelationOid - External Table Relation Oid
+ * Output:
+ * relTuples - exact number of tuples in relation.
+ */
+static void getExternalRelTuples(Oid extRelationOid, float4 *relTuples){
+   const char *schemaName = NULL;
+   const char *tableName = NULL;
+   schemaName = get_namespace_name(get_rel_namespace(extRelationOid)); /* 
must be pfreed */
+   tableName = get_rel_name(extRelationOid); /* must be pfreed */
+
+   StringInfoData str;
+   initStringInfo();
+   appendStringInfo(, "select count(*)::float4 from %s.%s as Ta",
+   quote_identifier(schemaName),
+   quote_identifier(tableName));
+
+   spiExecuteWithCallback(str.data, false /*readonly*/, 0 /*tcount */,
+   
spiCallback_getSingleResultRowColumnAsFloat4, relTuples);
+   pfree((void *) tableName);
--- End diff --

no need (void *), and does it need to be free?


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208437399
  
--- Diff: src/backend/cdb/cdbdatalocality.c ---
@@ -193,6 +212,9 @@ typedef struct Relation_Data {
List *files;
Oid partition_parent_relid;
int64 block_count;
+   int port;
--- End diff --

port and hostname is bindded with hivepath for hive protocol, remove them 
all.


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208449241
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -1042,7 +1277,7 @@ DefineExternalRelation(CreateExternalStmt 
*createExtStmt)
 * permissions in pg_auth for creating this 
table.
 */
 
-   bool isnull;

+   bool isnull;
Oid  userid = GetUserId();
--- End diff --

incorrect indent


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208449610
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -17561,29 +17887,67 @@ static Datum transformLocationUris(List *locs, 
List* fmtopts, bool isweb, bool i
if (first_uri && uri->protocol == URI_CUSTOM)
{
Oid procOid = InvalidOid;
-   
procOid = 
LookupExtProtocolFunction(uri->customprotocol, 

EXTPTC_FUNC_VALIDATOR, 

false);
 
if (OidIsValid(procOid) && Gp_role == GP_ROLE_DISPATCH)
InvokeProtocolValidation(procOid, 

 uri->customprotocol, 
-   
 iswritable, 
+   
 iswritable, forceCreateDir,

 locs, fmtopts);
+   first_customprotocal = uri->customprotocol;
}

+   if (first_uri && uri->protocol == URI_HDFS)
+   {
+   Oid procOid = InvalidOid;
+
+   procOid = LookupCustomProtocolValidatorFunc("hdfs");
+   if (OidIsValid(procOid) && Gp_role == GP_ROLE_DISPATCH)
+   {
+   InvokeProtocolValidation(procOid,
+   
 uri->customprotocol,
+   
 iswritable, forceCreateDir,
+   
 locs, fmtopts);
+   }
+   }
+
+   if (first_uri && uri->protocol == URI_HIVE)
+   {
+   Oid   procOid = InvalidOid;
+   procOid = LookupCustomProtocolValidatorFunc("hive");
+   if (OidIsValid(procOid) && Gp_role == GP_ROLE_DISPATCH)
+   {
+   InvokeProtocolValidation(procOid,
+   
uri->customprotocol,
+   
iswritable, forceCreateDir,
+   
locs, fmtopts);
+   }
+   }
+
+   if (first_uri && uri->protocol == URI_MAGMA)
--- End diff --

remove magma


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208436217
  
--- Diff: src/backend/access/external/fileam.c ---
@@ -1381,9 +1603,11 @@ lookupCustomFormatter(char *formatter_name, bool 
iswritable)
ereport(ERROR,

(errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
 errmsg("formatter function %s is not 
declared STABLE.",
-formatter_name),
+new_formatter_name),
 errOmitLocation(true)));
 
+   pfree(new_formatter_name);
--- End diff --

add  if (new_formatter_name != NULL)


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208435844
  
--- Diff: src/backend/access/external/fileam.c ---
@@ -1290,6 +1397,97 @@ externalgettup_custom(FileScanDesc scan, 
ExternalSelectDesc desc, ScanState *ss)
return NULL;
 }
 
+static HeapTuple
+externalgettup_custom_noextprot(FileScanDesc scan,
+   
ExternalSelectDesc desc,
+   ScanState *ss)
+{
+   HeapTuple   tuple;
+   CopyState   pstate = scan->fs_pstate;
+   FormatterData*  formatter = scan->fs_formatter;
+   boolno_more_data = false;
+   MemoryContext   oldctxt = CurrentMemoryContext;
+
+   Assert(formatter);
+
+   /* while didn't finish processing the entire file */
+   while (!no_more_data)
+   {
+   bool error_caught = false;
+
+   /*
+* Invoke the custom formatter function.
+*/
+   PG_TRY();
+   {
+   Datum   d;
+   FunctionCallInfoDatafcinfo;
--- End diff --

incorrect indent


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208451302
  
--- Diff: src/backend/optimizer/plan/createplan.c ---
@@ -1127,6 +1127,20 @@ bool is_pxf_protocol(Uri *uri)
return false;
 }
 
+bool is_hdfs_protocol(Uri *uri)
+{
+   return uri->protocol == URI_HDFS;
+}
+
+bool is_magma_protocol(Uri *uri)
+{
+   return uri->protocol == URI_MAGMA;
--- End diff --

remove magma and hive


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208451726
  
--- Diff: src/include/utils/uri.h ---
@@ -32,7 +32,11 @@ typedef enum UriProtocol
URI_HTTP,
URI_GPFDIST,
URI_CUSTOM,
-   URI_GPFDISTS
+   URI_GPFDISTS,
+   URI_HDFS,
+   URI_HBASE,
+   URI_MAGMA,
--- End diff --

remove



---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208449174
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -939,38 +983,217 @@ DefineExternalRelation(CreateExternalStmt 
*createExtStmt)
char* commandString = NULL;
char  rejectlimittype = '\0';
char  formattype;
+   char* formattername = NULL;
int   rejectlimit = -1;
int   encoding = -1;
int preferred_segment_num = -1;
bool  issreh = false; /* is single 
row error handling requested? */
+   bool  isexternal = 
createExtStmt->isexternal;
bool  iswritable = 
createExtStmt->iswritable;
bool  isweb = createExtStmt->isweb;
+   bool  forceCreateDir = 
createExtStmt->forceCreateDir;
+
+   bool  isExternalHdfs = false;
+   bool  isExternalMagma = false;
+   bool  isExternalHive = 
false;
+   char* location = NULL;
+   int   location_len = NULL;
 
/*
 * now set the parameters for keys/inheritance etc. Most of these are
 * uninteresting for external relations...
 */
-   createStmt->relation = createExtStmt->relation;
-   createStmt->tableElts = createExtStmt->tableElts;
-   createStmt->inhRelations = NIL;
-   createStmt->constraints = NIL;
-   createStmt->options = NIL;
-   createStmt->oncommit = ONCOMMIT_NOOP;
-   createStmt->tablespacename = NULL;
+   createStmt->base = createExtStmt->base;
+   // external table options is not compatible with internal table
+   // set NIL here
+   createStmt->base.options = NIL;
createStmt->policy = createExtStmt->policy; /* policy was set in 
transform */
-   
+
+/*
+* Recognize formatter option if there are some tokens found in parser.
+* This design is to give CREATE EXTERNAL TABLE DDL the flexiblity to
+* support user defined external formatter options.
+*/
+   recognizeExternalRelationFormatterOptions(createExtStmt);
+
+   /*
+* Get tablespace, database, schema for the relation
+*/
+   RangeVar *rel = createExtStmt->base.relation;
+   // get tablespace name for the relation
+   Oid tablespace_id = (gp_upgrade_mode) ? DEFAULTTABLESPACE_OID : 
GetDefaultTablespace();
+   if (!OidIsValid(tablespace_id))
+   {
+   tablespace_id = get_database_dts(MyDatabaseId);
+   }
+   char *tablespace_name = get_tablespace_name(tablespace_id);
+
+   // get database name for the relation
+   char *database_name = rel->catalogname ? rel->catalogname : 
get_database_name(MyDatabaseId);
+
+   // get schema name for the relation
+   char *schema_name = 
get_namespace_name(RangeVarGetCreationNamespace(rel));
+
+   // get table name for the relation
+   char *table_name = rel->relname;
+
+   /*
+* Do some special logic when we use custom
+*/
+   if (exttypeDesc->exttabletype == EXTTBL_TYPE_LOCATION)
+   {
+   if (exttypeDesc->location_list == NIL)
+   {
+   if (dfs_url == NULL)
+   {
+   ereport(ERROR,
+   (errcode(ERRCODE_SYNTAX_ERROR),
+errmsg("Cannot create table on HDFS 
when the service is not available"),
+errhint("Check HDFS service and 
hawq_dfs_url configuration"),
+errOmitLocation(true)));
+   }
+
+   location_len = strlen(PROTOCOL_HDFS) + /* 
hdfs:// */
+  strlen(dfs_url) +   /* 
hawq_dfs_url */
+  // 1 + strlen(filespace_name) + /* '/' + 
filespace name */
+  1 + strlen(tablespace_name) +   /* '/' + 
tablespace name */
+  1 + strlen(database_name) + /* '/' + 
database name */
+  1 + strlen(schema_name) +   /* '/' + 
schema name */
+  1 + strlen(table_name) + 1; /* '/' + 
table name + '\0' */
+
+   char *path;
+
+  

[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208451740
  
--- Diff: src/include/utils/uri.h ---
@@ -41,6 +45,10 @@ typedef enum UriProtocol
 #define PROTOCOL_GPFDIST   "gpfdist://"
 #define PROTOCOL_GPFDISTS  "gpfdists://"
 #define PROTOCOL_PXF   "pxf://"
+#define PROTOCOL_HDFS  "hdfs://"
+#define PROTOCOL_HBASE "hbase://"
+#define PROTOCOL_MAGMA "magma://"
--- End diff --

remove


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208451756
  
--- Diff: src/include/utils/uri.h ---
@@ -52,6 +60,10 @@ typedef enum UriProtocol
 #define IS_GPFDISTS_URI(uri_str) (pg_strncasecmp(uri_str, 
PROTOCOL_GPFDISTS, strlen(PROTOCOL_GPFDISTS)) == 0) 
 #define IS_FTP_URI(uri_str) (pg_strncasecmp(uri_str, PROTOCOL_FTP, 
strlen(PROTOCOL_FTP)) == 0)
 #define IS_PXF_URI(uri_str) (pg_strncasecmp(uri_str, PROTOCOL_PXF, 
strlen(PROTOCOL_PXF)) == 0)
+#define IS_HDFS_URI(uri_str) (pg_strncasecmp(uri_str, PROTOCOL_HDFS, 
strlen(PROTOCOL_HDFS)) == 0)
+#define IS_HBASE_URI(uri_str) (pg_strncasecmp(uri_str, PROTOCOL_HBASE, 
strlen(PROTOCOL_HBASE)) == 0)
+#define IS_MAGMA_URI(uri_str) (pg_strncasecmp(uri_str, PROTOCOL_MAGMA, 
strlen(PROTOCOL_MAGMA)) == 0)
--- End diff --

remove


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208437762
  
--- Diff: src/backend/cdb/cdbdatalocality.c ---
@@ -1579,7 +1700,242 @@ static void ParquetGetSegFileDataLocation(Relation 
relation,
return;
 }
 
+static void InvokeHDFSProtocolBlockLocation(OidprocOid,
+List  *locs,
+List **blockLocations)
+{
+   ExtProtocolValidatorData   *validator_data;
+   FmgrInfo   *validator_udf;
+   FunctionCallInfoDatafcinfo;
+
+   validator_data = (ExtProtocolValidatorData *)
+palloc0 
(sizeof(ExtProtocolValidatorData));
+   validator_udf = palloc(sizeof(FmgrInfo));
+   fmgr_info(procOid, validator_udf);
+
+   validator_data->type= T_ExtProtocolValidatorData;
+   validator_data->url_list= locs;
+   validator_data->format_opts = NULL;
+   validator_data->errmsg  = NULL;
+   validator_data->direction   = EXT_VALIDATE_READ;
+   validator_data->action  = EXT_VALID_ACT_GETBLKLOC;
+
+   InitFunctionCallInfoData(/* FunctionCallInfoData */ fcinfo,
+/* FmgrInfo */ 
validator_udf,
+/* nArgs */ 0,
+/* Call Context */ 
(Node *) validator_data,
+/* ResultSetInfo */ 
NULL);
+
+   /* invoke validator. if this function returns - validation passed */
+   FunctionCallInvoke();
+
+   ExtProtocolBlockLocationData *bls =
+   (ExtProtocolBlockLocationData *)(fcinfo.resultinfo);
+   /* debug output block location. */
+   if (bls != NULL)
+   {
+   ListCell *c;
+   foreach(c, bls->files)
+   {
+   blocklocation_file *blf = (blocklocation_file 
*)(lfirst(c));
+   elog(DEBUG3, "DEBUG LOCATION for %s with %d blocks",
+blf->file_uri, blf->block_num);
+   for ( int i = 0 ; i < blf->block_num ; ++i )
+   {
+   BlockLocation *pbl = &(blf->locations[i]);
+   elog(DEBUG3, "DEBUG LOCATION for block %d : %d, 
"
+INT64_FORMAT ", " 
INT64_FORMAT ", %d",
+i,
+pbl->corrupt, pbl->length, 
pbl->offset,
+pbl->numOfNodes);
+   for ( int j = 0 ; j < pbl->numOfNodes ; ++j )
+   {
+   elog(DEBUG3, "DEBUG LOCATION for block 
%d : %s, %s, %s",
+i,
+pbl->hosts[j], 
pbl->names[j],
+
pbl->topologyPaths[j]);
+   }
+   }
+   }
+   }
 
+   elog(DEBUG3, "after invoking get block location API");
+
+   /* get location data from fcinfo.resultinfo. */
+   if (bls != NULL)
+   {
+   Assert(bls->type == T_ExtProtocolBlockLocationData);
+   while(list_length(bls->files) > 0)
+   {
+   void *v = lfirst(list_head(bls->files));
+   bls->files = list_delete_first(bls->files);
+   *blockLocations = lappend(*blockLocations, v);
+   }
+   }
+   pfree(validator_data);
+   pfree(validator_udf);
+}
+
+Oid
+LookupCustomProtocolBlockLocationFunc(char *protoname)
+{
+   List*   funcname= NIL;
--- End diff --

incorrect indent


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208449650
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -17652,7 +18018,47 @@ static Datum transformLocationUris(List *locs, 
List* fmtopts, bool isweb, bool i
result = (Datum) 0;
 
return result;
+}
+
+static Oid
+LookupCustomProtocolValidatorFunc(char *protoname)
+{
+   List*   funcname= NIL;
--- End diff --

indent


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208437472
  
--- Diff: src/backend/cdb/cdbdatalocality.c ---
@@ -844,36 +918,17 @@ int64 
get_block_locations_and_claculte_table_size(split_to_segment_mapping_conte
/*
 * We only consider the data stored in HDFS.
 */
-   if (RelationIsAoRows(rel) || RelationIsParquet(rel)) {
-   Relation_Data *rel_data = NULL;
-   /*
-* Get pg_appendonly information for this table.
-*/
-   AppendOnlyEntry *aoEntry = GetAppendOnlyEntry(rel_oid, 
SnapshotNow);
-
-   rel_data = (Relation_Data *) 
palloc(sizeof(Relation_Data));
+   bool isDataStoredInHdfs = dataStoredInHdfs(rel);
+   if (isDataStoredInHdfs ) {
+   GpPolicy *targetPolicy = 
GpPolicyFetch(CurrentMemoryContext, rel_oid);
+   Relation_Data *rel_data = (Relation_Data *) 
palloc(sizeof(Relation_Data));
rel_data->relid = rel_oid;
rel_data->files = NIL;
rel_data->partition_parent_relid = 0;
rel_data->block_count = 0;
-
-   GpPolicy *targetPolicy = NULL;
-   targetPolicy = GpPolicyFetch(CurrentMemoryContext, 
rel_oid);
-   /*
-* Based on the pg_appendonly information, calculate 
the data
-* location information associated with this relation.
-*/
-   if (RelationIsAoRows(rel)) {
-   rel_data->type = DATALOCALITY_APPENDONLY;
-   AOGetSegFileDataLocation(rel, aoEntry, 
ActiveSnapshot, context,
-   aoEntry->splitsize, rel_data, 
,
-   , targetPolicy);
-   } else {
-   rel_data->type = DATALOCALITY_PARQUET;
-   ParquetGetSegFileDataLocation(rel, aoEntry, 
ActiveSnapshot, context,
-   context->split_size, rel_data, 
,
-   , targetPolicy);
-   }
+   rel_data->port = 0;
--- End diff --

NO "port and hostname"


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208451046
  
--- Diff: src/backend/nodes/outfuncs.c ---
@@ -2170,16 +2170,27 @@ _outCreateExternalStmt(StringInfo str, 
CreateExternalStmt *node)
 {
WRITE_NODE_TYPE("CREATEEXTERNALSTMT");
 
-   WRITE_NODE_FIELD(relation);
-   WRITE_NODE_FIELD(tableElts);
+   WRITE_CHAR_FIELD(base.relKind);
+   WRITE_NODE_FIELD(base.relation);
+   WRITE_NODE_FIELD(base.tableElts);
+   WRITE_NODE_FIELD(base.inhRelations);
+   WRITE_NODE_FIELD(base.constraints);
+   WRITE_NODE_FIELD(base.options);
+   WRITE_ENUM_FIELD(base.oncommit, OnCommitAction);
+   WRITE_STRING_FIELD(base.tablespacename);
+   WRITE_NODE_FIELD(base.distributedBy);
+   WRITE_BOOL_FIELD(base.is_part_child);
+   WRITE_BOOL_FIELD(base.is_add_part);
+   WRITE_NODE_FIELD(base.partitionBy);
--- End diff --

check the struct with master branch


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208436246
  
--- Diff: src/backend/access/external/fileam.c ---
@@ -1664,7 +1888,9 @@ FunctionCallPrepareFormatter(FunctionCallInfoData*
fcinfo,
 Relation   
rel,
 TupleDesc  
tupDesc,
 FmgrInfo   
   *convFuncs,
-Oid   
*typioparams)
+Oid   
*typioparams,
--- End diff --

incorrect indent


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-09 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208438223
  
--- Diff: src/backend/commands/analyze.c ---
@@ -989,17 +1013,20 @@ static List* analyzableRelations(bool rootonly, List 
**fullRelOids)
while (HeapTupleIsValid(tuple = caql_getnext(pcqCtx)))
{
Oid candidateOid = HeapTupleGetOid(tuple);
-   if (analyzePermitted(candidateOid)
-   && candidateOid != 
StatisticRelationId)
+   bool isExternalHDFSORMAGMA = 
isExternalHDFSORMAGMAProtocol(candidateOid);
+   if (analyzePermitted(candidateOid) &&
+   candidateOid != StatisticRelationId &&
+   isExternalHDFSORMAGMA)
--- End diff --

remove magma


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208438791
  
--- Diff: src/include/catalog/pg_exttable.h ---
@@ -164,9 +164,12 @@ GetExtTableEntry(Oid relid);
 extern void
 RemoveExtTableEntry(Oid relid);
 
-#define CustomFormatType 'b'
-#define TextFormatType 't'
-#define CsvFormatType 'c'
+#define CustomFormatType'b'
+#define TextFormatType  't'
--- End diff --

There should be no hard code format type in pluggable storage framwork


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208438657
  
--- Diff: src/include/utils/uri.h ---
@@ -52,6 +60,10 @@ typedef enum UriProtocol
 #define IS_GPFDISTS_URI(uri_str) (pg_strncasecmp(uri_str, 
PROTOCOL_GPFDISTS, strlen(PROTOCOL_GPFDISTS)) == 0) 
 #define IS_FTP_URI(uri_str) (pg_strncasecmp(uri_str, PROTOCOL_FTP, 
strlen(PROTOCOL_FTP)) == 0)
 #define IS_PXF_URI(uri_str) (pg_strncasecmp(uri_str, PROTOCOL_PXF, 
strlen(PROTOCOL_PXF)) == 0)
+#define IS_HDFS_URI(uri_str) (pg_strncasecmp(uri_str, PROTOCOL_HDFS, 
strlen(PROTOCOL_HDFS)) == 0)
+#define IS_HBASE_URI(uri_str) (pg_strncasecmp(uri_str, PROTOCOL_HBASE, 
strlen(PROTOCOL_HBASE)) == 0)
--- End diff --

No HBase, Hive, and Magma


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208438576
  
--- Diff: src/include/utils/uri.h ---
@@ -41,6 +45,10 @@ typedef enum UriProtocol
 #define PROTOCOL_GPFDIST   "gpfdist://"
 #define PROTOCOL_GPFDISTS  "gpfdists://"
 #define PROTOCOL_PXF   "pxf://"
+#define PROTOCOL_HDFS  "hdfs://"
+#define PROTOCOL_HBASE "hbase://"
--- End diff --

No Hive and Magma


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208438342
  
--- Diff: src/include/utils/uri.h ---
@@ -32,7 +32,11 @@ typedef enum UriProtocol
URI_HTTP,
URI_GPFDIST,
URI_CUSTOM,
-   URI_GPFDISTS
+   URI_GPFDISTS,
+   URI_HDFS,
--- End diff --

No Hive and Magma


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208438291
  
--- Diff: src/include/nodes/parsenodes.h ---
@@ -1459,8 +1463,10 @@ typedef struct SharedStorageOpStmt
  */
 typedef enum ExtTableType
 {
-   EXTTBL_TYPE_LOCATION,   /* table defined with LOCATION clause */
-   EXTTBL_TYPE_EXECUTE /* table defined with EXECUTE 
clause */
+EXTTBL_TYPE_LOCATION, /* table defined with LOCATION clause */
+EXTTBL_TYPE_EXECUTE,  /* table defined with EXECUTE clause */
+EXTTBL_TYPE_MAGMA,/* table defined with MAGMA */
--- End diff --

No magma


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208438235
  
--- Diff: src/include/access/xact.h ---
@@ -173,6 +173,33 @@ typedef struct XidBuffer
 extern XidBuffer subxbuf;
 extern File subxip_file;
 
+/*
--- End diff --

No transaction for pluggable storage framework for now


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208438165
  
--- Diff: src/include/access/formatter.h ---
@@ -36,10 +36,19 @@
 typedef enum FmtNotification
 {
FMT_NONE,
+   FMT_DONE,
FMT_NEED_MORE_DATA
 
 } FmtNotification;
 
+typedef enum FmtActionMask
--- End diff --

The pluggable storage framework do not use this structure, remove it


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208437581
  
--- Diff: src/backend/optimizer/plan/createplan.c ---
@@ -1127,6 +1127,20 @@ bool is_pxf_protocol(Uri *uri)
return false;
 }
 
+bool is_hdfs_protocol(Uri *uri)
+{
+   return uri->protocol == URI_HDFS;
+}
+
+bool is_magma_protocol(Uri *uri)
--- End diff --

No magma and hive protocol


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208437403
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -18041,11 +18646,11 @@ static Datum transformFormatOpts(char formattype, 
List *formatOpts, int numcols,
  1 +   
/* space*/
  4 + 1 + 1 + strlen(null_print) + 1 +  /* "null 'str'" 
*/
  1 +   
/* space*/
- 6 + 1 + 1 + strlen(escape) + 1;   /* 
"escape 'c' or 'off' */
+ 6 + 1 + 1 + strlen(escape) + 1;   /* 
"escape 'c' or 'off' */
--- End diff --

Do not format the code if you don't change it logic


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208437352
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -17845,14 +18448,16 @@ static Datum transformFormatOpts(char formattype, 
List *formatOpts, int numcols,
 
Assert(fmttype_is_custom(formattype) || 
   fmttype_is_text(formattype) ||
-  fmttype_is_csv(formattype));
+  fmttype_is_csv(formattype) ||
--- End diff --

The check of format options should not be hard coded.


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208436378
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -17615,14 +17979,16 @@ static Datum transformLocationUris(List *locs, 
List* fmtopts, bool isweb, bool i
 errhint("Writable external tables may 
use \'gpfdist(s)\' URIs only."),
   
errOmitLocation(true)));
 
-   if(uri->protocol != URI_CUSTOM && iswritable && 
strchr(uri->path, '*'))
+   if(uri->protocol != URI_CUSTOM && uri->protocol != URI_MAGMA && 
iswritable && strchr(uri->path, '*'))
--- End diff --

No magma


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208436303
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -17528,6 +17845,15 @@ static Datum transformLocationUris(List *locs, 
List* fmtopts, bool isweb, bool i
errOmitLocation(true)));
}
 
+   // Oushu will not support pxf anymore
--- End diff --

Apache HAWQ still have pxf available, remove this


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208436195
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -1222,20 +1529,24 @@ DefineExternalRelation(CreateExternalStmt 
*createExtStmt)
elog(ERROR, "unrecognized node type: %d",
 nodeTag(dencoding->arg));
}
+   if (!dencoding && isExternalMagma)
--- End diff --

No magma


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208436174
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -1210,6 +1507,16 @@ DefineExternalRelation(CreateExternalStmt 
*createExtStmt)
else if (IsA(dencoding->arg, String))
{
encoding_name = strVal(dencoding->arg);
+
+   /* custom format */
+   if (!fmttype_is_text(formattype) && 
!fmttype_is_csv(formattype) &&
--- End diff --

There should be no hard coded format name in pluggable storage framework


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208436164
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -1199,6 +1489,13 @@ DefineExternalRelation(CreateExternalStmt 
*createExtStmt)
{
encoding = intVal(dencoding->arg);
encoding_name = pg_encoding_to_char(encoding);
+
+   /* custom format */
+   if (!fmttype_is_text(formattype) && 
!fmttype_is_csv(formattype))
--- End diff --

There should be no hard coded format name in pluggable storage framework


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208435854
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -1174,6 +1426,44 @@ DefineExternalRelation(CreateExternalStmt 
*createExtStmt)
fmtErrTblOid = InvalidOid; /* no err tbl was requested 
*/
}
 
+   /*
+* Parse and validate FORMAT clause.
+*
+* We force formatter as 'custom' if it is external hdfs protocol
+*/
+   formattype = (isExternalHdfs || isExternalMagma || isExternalHive) ?
+'b' : transformFormatType(createExtStmt->format);
+
+   if ((formattype == 'b') &&
--- End diff --

There should be no hard coded format name in pluggable storage framework


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208435805
  
--- Diff: src/backend/access/external/fileam.c ---
@@ -1290,6 +1397,97 @@ externalgettup_custom(FileScanDesc scan, 
ExternalSelectDesc desc, ScanState *ss)
return NULL;
 }
 
+static HeapTuple
+externalgettup_custom_noextprot(FileScanDesc scan,
+   
ExternalSelectDesc desc,
+   ScanState *ss)
+{
+   HeapTuple   tuple;
+   CopyState   pstate = scan->fs_pstate;
--- End diff --

incorrect indent


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208435735
  
--- Diff: src/backend/access/external/fileam.c ---
@@ -782,6 +864,28 @@ external_insert(ExternalInsertDesc extInsertDesc, 
TupleTableSlot *tupTableSlot)
 void
 external_insert_finish(ExternalInsertDesc extInsertDesc)
 {
+   /* Tell formatter to close */
+   if (extInsertDesc->ext_formatter_data != NULL &&
+   (extInsertDesc->ext_formatter_data->fmt_mask & FMT_NEEDEXTBUFF) 
== 0)
+   {
+   Datum d;
+   FunctionCallInfoData fcinfo;
+
+   extInsertDesc->ext_formatter_data->fmt_mask |= FMT_WRITE_END;
+
+   /* per call formatter prep */
+   FunctionCallPrepareFormatter(,
--- End diff --

incorrect indent


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208435758
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -1105,53 +1340,70 @@ DefineExternalRelation(CreateExternalStmt 
*createExtStmt)
char*   protname = 
uri->customprotocol;
Oid ptcId = 
LookupExtProtocolOid(protname, false);
AclResult   aclresult;
-   
+
/* Check we have the right permissions 
on this protocol */
if (!pg_extprotocol_ownercheck(ptcId, 
ownerId))
-   {   
+   {
AclMode mode = (iswritable ? 
ACL_INSERT : ACL_SELECT);
-   
+
aclresult = 
pg_extprotocol_aclcheck(ptcId, ownerId, mode);
-   
+
if (aclresult != ACLCHECK_OK)

aclcheck_error(aclresult, ACL_KIND_EXTPROTOCOL, protname);
}
}
+   /* magma follow the same ack check as HDFS */
+   else if (uri->protocol == URI_HDFS || 
uri->protocol == URI_MAGMA)
--- End diff --

No magma


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208435710
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -939,38 +983,217 @@ DefineExternalRelation(CreateExternalStmt 
*createExtStmt)
char* commandString = NULL;
char  rejectlimittype = '\0';
char  formattype;
+   char* formattername = NULL;
int   rejectlimit = -1;
int   encoding = -1;
int preferred_segment_num = -1;
bool  issreh = false; /* is single 
row error handling requested? */
+   bool  isexternal = 
createExtStmt->isexternal;
bool  iswritable = 
createExtStmt->iswritable;
bool  isweb = createExtStmt->isweb;
+   bool  forceCreateDir = 
createExtStmt->forceCreateDir;
+
+   bool  isExternalHdfs = false;
+   bool  isExternalMagma = false;
+   bool  isExternalHive = 
false;
+   char* location = NULL;
+   int   location_len = NULL;
 
/*
 * now set the parameters for keys/inheritance etc. Most of these are
 * uninteresting for external relations...
 */
-   createStmt->relation = createExtStmt->relation;
-   createStmt->tableElts = createExtStmt->tableElts;
-   createStmt->inhRelations = NIL;
-   createStmt->constraints = NIL;
-   createStmt->options = NIL;
-   createStmt->oncommit = ONCOMMIT_NOOP;
-   createStmt->tablespacename = NULL;
+   createStmt->base = createExtStmt->base;
+   // external table options is not compatible with internal table
+   // set NIL here
+   createStmt->base.options = NIL;
createStmt->policy = createExtStmt->policy; /* policy was set in 
transform */
-   
+
+/*
+* Recognize formatter option if there are some tokens found in parser.
+* This design is to give CREATE EXTERNAL TABLE DDL the flexiblity to
+* support user defined external formatter options.
+*/
+   recognizeExternalRelationFormatterOptions(createExtStmt);
+
+   /*
+* Get tablespace, database, schema for the relation
+*/
+   RangeVar *rel = createExtStmt->base.relation;
+   // get tablespace name for the relation
+   Oid tablespace_id = (gp_upgrade_mode) ? DEFAULTTABLESPACE_OID : 
GetDefaultTablespace();
+   if (!OidIsValid(tablespace_id))
+   {
+   tablespace_id = get_database_dts(MyDatabaseId);
+   }
+   char *tablespace_name = get_tablespace_name(tablespace_id);
+
+   // get database name for the relation
+   char *database_name = rel->catalogname ? rel->catalogname : 
get_database_name(MyDatabaseId);
+
+   // get schema name for the relation
+   char *schema_name = 
get_namespace_name(RangeVarGetCreationNamespace(rel));
+
+   // get table name for the relation
+   char *table_name = rel->relname;
+
+   /*
+* Do some special logic when we use custom
+*/
+   if (exttypeDesc->exttabletype == EXTTBL_TYPE_LOCATION)
+   {
+   if (exttypeDesc->location_list == NIL)
+   {
+   if (dfs_url == NULL)
+   {
+   ereport(ERROR,
+   (errcode(ERRCODE_SYNTAX_ERROR),
+errmsg("Cannot create table on HDFS 
when the service is not available"),
+errhint("Check HDFS service and 
hawq_dfs_url configuration"),
+errOmitLocation(true)));
+   }
+
+   location_len = strlen(PROTOCOL_HDFS) + /* 
hdfs:// */
+  strlen(dfs_url) +   /* 
hawq_dfs_url */
+  // 1 + strlen(filespace_name) + /* '/' + 
filespace name */
+  1 + strlen(tablespace_name) +   /* '/' + 
tablespace name */
+  1 + strlen(database_name) + /* '/' + 
database name */
+  1 + strlen(schema_name) +   /* '/' + 
schema name */
+  1 + strlen(table_name) + 1; /* '/' + 
table name + '\0' */
+
+   char *path;
+
+ 

[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208435664
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -939,38 +983,217 @@ DefineExternalRelation(CreateExternalStmt 
*createExtStmt)
char* commandString = NULL;
char  rejectlimittype = '\0';
char  formattype;
+   char* formattername = NULL;
int   rejectlimit = -1;
int   encoding = -1;
int preferred_segment_num = -1;
bool  issreh = false; /* is single 
row error handling requested? */
+   bool  isexternal = 
createExtStmt->isexternal;
bool  iswritable = 
createExtStmt->iswritable;
bool  isweb = createExtStmt->isweb;
+   bool  forceCreateDir = 
createExtStmt->forceCreateDir;
+
+   bool  isExternalHdfs = false;
+   bool  isExternalMagma = false;
+   bool  isExternalHive = 
false;
+   char* location = NULL;
+   int   location_len = NULL;
 
/*
 * now set the parameters for keys/inheritance etc. Most of these are
 * uninteresting for external relations...
 */
-   createStmt->relation = createExtStmt->relation;
-   createStmt->tableElts = createExtStmt->tableElts;
-   createStmt->inhRelations = NIL;
-   createStmt->constraints = NIL;
-   createStmt->options = NIL;
-   createStmt->oncommit = ONCOMMIT_NOOP;
-   createStmt->tablespacename = NULL;
+   createStmt->base = createExtStmt->base;
+   // external table options is not compatible with internal table
+   // set NIL here
+   createStmt->base.options = NIL;
createStmt->policy = createExtStmt->policy; /* policy was set in 
transform */
-   
+
+/*
+* Recognize formatter option if there are some tokens found in parser.
+* This design is to give CREATE EXTERNAL TABLE DDL the flexiblity to
+* support user defined external formatter options.
+*/
+   recognizeExternalRelationFormatterOptions(createExtStmt);
+
+   /*
+* Get tablespace, database, schema for the relation
+*/
+   RangeVar *rel = createExtStmt->base.relation;
+   // get tablespace name for the relation
+   Oid tablespace_id = (gp_upgrade_mode) ? DEFAULTTABLESPACE_OID : 
GetDefaultTablespace();
+   if (!OidIsValid(tablespace_id))
+   {
+   tablespace_id = get_database_dts(MyDatabaseId);
+   }
+   char *tablespace_name = get_tablespace_name(tablespace_id);
+
+   // get database name for the relation
+   char *database_name = rel->catalogname ? rel->catalogname : 
get_database_name(MyDatabaseId);
+
+   // get schema name for the relation
+   char *schema_name = 
get_namespace_name(RangeVarGetCreationNamespace(rel));
+
+   // get table name for the relation
+   char *table_name = rel->relname;
+
+   /*
+* Do some special logic when we use custom
+*/
+   if (exttypeDesc->exttabletype == EXTTBL_TYPE_LOCATION)
+   {
+   if (exttypeDesc->location_list == NIL)
+   {
+   if (dfs_url == NULL)
+   {
+   ereport(ERROR,
+   (errcode(ERRCODE_SYNTAX_ERROR),
+errmsg("Cannot create table on HDFS 
when the service is not available"),
+errhint("Check HDFS service and 
hawq_dfs_url configuration"),
+errOmitLocation(true)));
+   }
+
+   location_len = strlen(PROTOCOL_HDFS) + /* 
hdfs:// */
+  strlen(dfs_url) +   /* 
hawq_dfs_url */
+  // 1 + strlen(filespace_name) + /* '/' + 
filespace name */
+  1 + strlen(tablespace_name) +   /* '/' + 
tablespace name */
+  1 + strlen(database_name) + /* '/' + 
database name */
+  1 + strlen(schema_name) +   /* '/' + 
schema name */
+  1 + strlen(table_name) + 1; /* '/' + 
table name + '\0' */
+
+   char *path;
+
+ 

[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208435628
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -939,38 +983,217 @@ DefineExternalRelation(CreateExternalStmt 
*createExtStmt)
char* commandString = NULL;
char  rejectlimittype = '\0';
char  formattype;
+   char* formattername = NULL;
int   rejectlimit = -1;
int   encoding = -1;
int preferred_segment_num = -1;
bool  issreh = false; /* is single 
row error handling requested? */
+   bool  isexternal = 
createExtStmt->isexternal;
bool  iswritable = 
createExtStmt->iswritable;
bool  isweb = createExtStmt->isweb;
+   bool  forceCreateDir = 
createExtStmt->forceCreateDir;
+
+   bool  isExternalHdfs = false;
+   bool  isExternalMagma = false;
+   bool  isExternalHive = 
false;
+   char* location = NULL;
+   int   location_len = NULL;
 
/*
 * now set the parameters for keys/inheritance etc. Most of these are
 * uninteresting for external relations...
 */
-   createStmt->relation = createExtStmt->relation;
-   createStmt->tableElts = createExtStmt->tableElts;
-   createStmt->inhRelations = NIL;
-   createStmt->constraints = NIL;
-   createStmt->options = NIL;
-   createStmt->oncommit = ONCOMMIT_NOOP;
-   createStmt->tablespacename = NULL;
+   createStmt->base = createExtStmt->base;
+   // external table options is not compatible with internal table
+   // set NIL here
+   createStmt->base.options = NIL;
createStmt->policy = createExtStmt->policy; /* policy was set in 
transform */
-   
+
+/*
+* Recognize formatter option if there are some tokens found in parser.
+* This design is to give CREATE EXTERNAL TABLE DDL the flexiblity to
+* support user defined external formatter options.
+*/
+   recognizeExternalRelationFormatterOptions(createExtStmt);
+
+   /*
+* Get tablespace, database, schema for the relation
+*/
+   RangeVar *rel = createExtStmt->base.relation;
+   // get tablespace name for the relation
+   Oid tablespace_id = (gp_upgrade_mode) ? DEFAULTTABLESPACE_OID : 
GetDefaultTablespace();
+   if (!OidIsValid(tablespace_id))
+   {
+   tablespace_id = get_database_dts(MyDatabaseId);
+   }
+   char *tablespace_name = get_tablespace_name(tablespace_id);
+
+   // get database name for the relation
+   char *database_name = rel->catalogname ? rel->catalogname : 
get_database_name(MyDatabaseId);
+
+   // get schema name for the relation
+   char *schema_name = 
get_namespace_name(RangeVarGetCreationNamespace(rel));
+
+   // get table name for the relation
+   char *table_name = rel->relname;
+
+   /*
+* Do some special logic when we use custom
+*/
+   if (exttypeDesc->exttabletype == EXTTBL_TYPE_LOCATION)
+   {
+   if (exttypeDesc->location_list == NIL)
+   {
+   if (dfs_url == NULL)
+   {
+   ereport(ERROR,
+   (errcode(ERRCODE_SYNTAX_ERROR),
+errmsg("Cannot create table on HDFS 
when the service is not available"),
+errhint("Check HDFS service and 
hawq_dfs_url configuration"),
+errOmitLocation(true)));
+   }
+
+   location_len = strlen(PROTOCOL_HDFS) + /* 
hdfs:// */
+  strlen(dfs_url) +   /* 
hawq_dfs_url */
+  // 1 + strlen(filespace_name) + /* '/' + 
filespace name */
+  1 + strlen(tablespace_name) +   /* '/' + 
tablespace name */
+  1 + strlen(database_name) + /* '/' + 
database name */
+  1 + strlen(schema_name) +   /* '/' + 
schema name */
+  1 + strlen(table_name) + 1; /* '/' + 
table name + '\0' */
+
+   char *path;
+
+ 

[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208435571
  
--- Diff: src/backend/commands/tablecmds.c ---
@@ -632,9 +665,20 @@ DefineRelation_int(CreateStmt *stmt,
 */
descriptor = BuildDescForRelation(schema);
 
-   localHasOids = interpretOidsOption(stmt->options);
+   localHasOids = interpretOidsOption(stmt->base.options);
descriptor->tdhasoid = (localHasOids || parentOidCount > 0);
 
+   /*
+* Check supported data types for pluggable format, i.e., orc
+* Need to remove this check if all data types are supported for orc 
format.
+*/
+   if ((relkind == RELKIND_RELATION) && (relstorage == 
RELSTORAGE_EXTERNAL) &&
+   formattername && pg_strncasecmp(formattername, "text", 
strlen("text")) &&
--- End diff --

TEXT/CSV format in pluggable storage should have no data type constraint, 
should remove this.


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208435430
  
--- Diff: src/backend/access/external/fileam.c ---
@@ -305,11 +308,53 @@ external_beginscan(ExternalScan *extScan,
scan->errcontext.previous = error_context_stack;
 
//pgstat_initstats(relation);
-
+   external_populate_formatter_actionmask(scan->fs_pstate, 
scan->fs_formatter);
return scan;
 }
 
 
+/* 
+ * external_populate_formatter_actionmask
+ * 
+ */
+void external_populate_formatter_actionmask(struct CopyStateData *pstate,
--- End diff --

use CopyState pstate instead.


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208435255
  
--- Diff: src/backend/commands/analyze.c ---
@@ -442,6 +455,17 @@ void analyzeStmt(VacuumStmt *stmt, List *relids, int 
preferred_seg_num)
"Please run ANALYZE on 
the root partition table.",

get_rel_name(relationOid;
}
+   else if (!isExternalHDFSORMAGMAProtocol(relationOid))
+   {
+   /*
+* Support analyze for external table.
+* For now, HDFS protocol external 
table is supported.
+*/
+   ereport(WARNING,
+(errmsg("skipping \"%s\" --- cannot 
analyze external table with non-HDFS or non-MAGMA protocol. "
--- End diff --

No "Magma"


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208435117
  
--- Diff: src/backend/commands/analyze.c ---
@@ -442,6 +455,17 @@ void analyzeStmt(VacuumStmt *stmt, List *relids, int 
preferred_seg_num)
"Please run ANALYZE on 
the root partition table.",

get_rel_name(relationOid;
}
+   else if (!isExternalHDFSORMAGMAProtocol(relationOid))
--- End diff --

Use "isExternalHDFSProtocol"


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208435034
  
--- Diff: src/backend/commands/analyze.c ---
@@ -153,9 +160,15 @@ static List*buildExplicitAttributeNames(Oid 
relationOid, VacuumStmt *stmt);
 static void gp_statistics_estimate_reltuples_relpages_heap(Relation rel, 
float4 *reltuples, float4 *relpages);
 static void gp_statistics_estimate_reltuples_relpages_ao_rows(Relation 
rel, float4 *reltuples, float4 *relpages);
 static void gp_statistics_estimate_reltuples_relpages_parquet(Relation 
rel, float4 *reltuples, float4 *relpages);
+static void gp_statistics_estimate_reltuples_relpages_external(Relation 
rel, float4 *relTuples, float4 *relPages);
 static void analyzeEstimateReltuplesRelpages(Oid relationOid, float4 
*relTuples, float4 *relPages, bool rootonly);
 static void analyzeEstimateIndexpages(Oid relationOid, Oid indexOid, 
float4 *indexPages);
 
+static void getExternalRelTuples(Oid relationOid, float4 *relTuples);
+static void getExternalRelPages(Oid relationOid, float4 *relPages , 
Relation rel);
+static float4 getExtrelPagesHDFS(Uri *uri);
+static bool isExternalHDFSORMAGMAProtocol(Oid relOid);
--- End diff --

Implement only HDFS protocol, not HDFS and Magma


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208434847
  
--- Diff: src/backend/cdb/cdbdatalocality.c ---
@@ -1579,7 +1700,242 @@ static void ParquetGetSegFileDataLocation(Relation 
relation,
return;
 }
 
+static void InvokeHDFSProtocolBlockLocation(OidprocOid,
+List  *locs,
+List **blockLocations)
+{
+   ExtProtocolValidatorData   *validator_data;
+   FmgrInfo   *validator_udf;
+   FunctionCallInfoDatafcinfo;
+
+   validator_data = (ExtProtocolValidatorData *)
+palloc0 
(sizeof(ExtProtocolValidatorData));
+   validator_udf = palloc(sizeof(FmgrInfo));
+   fmgr_info(procOid, validator_udf);
+
+   validator_data->type= T_ExtProtocolValidatorData;
+   validator_data->url_list= locs;
+   validator_data->format_opts = NULL;
+   validator_data->errmsg  = NULL;
+   validator_data->direction   = EXT_VALIDATE_READ;
+   validator_data->action  = EXT_VALID_ACT_GETBLKLOC;
+
+   InitFunctionCallInfoData(/* FunctionCallInfoData */ fcinfo,
+/* FmgrInfo */ 
validator_udf,
+/* nArgs */ 0,
+/* Call Context */ 
(Node *) validator_data,
+/* ResultSetInfo */ 
NULL);
+
+   /* invoke validator. if this function returns - validation passed */
+   FunctionCallInvoke();
+
+   ExtProtocolBlockLocationData *bls =
+   (ExtProtocolBlockLocationData *)(fcinfo.resultinfo);
+   /* debug output block location. */
+   if (bls != NULL)
+   {
+   ListCell *c;
+   foreach(c, bls->files)
+   {
+   blocklocation_file *blf = (blocklocation_file 
*)(lfirst(c));
+   elog(DEBUG3, "DEBUG LOCATION for %s with %d blocks",
+blf->file_uri, blf->block_num);
+   for ( int i = 0 ; i < blf->block_num ; ++i )
+   {
+   BlockLocation *pbl = &(blf->locations[i]);
+   elog(DEBUG3, "DEBUG LOCATION for block %d : %d, 
"
+INT64_FORMAT ", " 
INT64_FORMAT ", %d",
+i,
+pbl->corrupt, pbl->length, 
pbl->offset,
+pbl->numOfNodes);
+   for ( int j = 0 ; j < pbl->numOfNodes ; ++j )
+   {
+   elog(DEBUG3, "DEBUG LOCATION for block 
%d : %s, %s, %s",
+i,
+pbl->hosts[j], 
pbl->names[j],
+
pbl->topologyPaths[j]);
+   }
+   }
+   }
+   }
 
+   elog(DEBUG3, "after invoking get block location API");
+
+   /* get location data from fcinfo.resultinfo. */
+   if (bls != NULL)
+   {
+   Assert(bls->type == T_ExtProtocolBlockLocationData);
+   while(list_length(bls->files) > 0)
+   {
+   void *v = lfirst(list_head(bls->files));
+   bls->files = list_delete_first(bls->files);
+   *blockLocations = lappend(*blockLocations, v);
+   }
+   }
+   pfree(validator_data);
+   pfree(validator_udf);
+}
+
+Oid
+LookupCustomProtocolBlockLocationFunc(char *protoname)
+{
+   List*   funcname= NIL;
+   Oid procOid = InvalidOid;
+   Oid argList[1];
+   Oid returnOid;
+
+   char*   new_func_name = (char *)palloc0(strlen(protoname) + 16);
+   sprintf(new_func_name, "%s_blocklocation", protoname);
+   funcname = lappend(funcname, makeString(new_func_name));
+   returnOid = VOIDOID;
+   procOid = LookupFuncName(funcname, 0, argList, true);
+
+   if (!OidIsValid(procOid))
+   ereport(ERROR, (errcode(ERRCODE_UNDEFINED_FUNCTION),
+   errmsg("protocol function %s 
was not found.",
+   new_func_name),
+   errhint("Create it with CREATE 
FUNCTION."),
+   

[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208434889
  
--- Diff: src/backend/cdb/cdbdatalocality.c ---
@@ -4039,6 +4398,9 @@ calculate_planner_segment_num(Query *query, 
QueryResourceLife resourceLife,
result->planner_segments = 0;
result->datalocalityInfo = makeStringInfo();
 result->datalocalityTime = 0;
+ //   result->hivehost = NULL;
--- End diff --

Remove


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208434912
  
--- Diff: src/backend/cdb/cdbdatalocality.c ---
@@ -4104,6 +4468,12 @@ calculate_planner_segment_num(Query *query, 
QueryResourceLife resourceLife,
 
/* get block location and calculate relation size*/
get_block_locations_and_claculte_table_size();
+   if(context.chsl_context.relations){
+   Relation_Data* tmp = (Relation_Data*) 
lfirst(context.chsl_context.relations->tail);
+   //  result->hivehost = (tmp->hostname) ? 
pstrdup(tmp->hostname):(char*)NULL;
--- End diff --

Remove


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208434835
  
--- Diff: src/backend/cdb/cdbdatalocality.c ---
@@ -1579,7 +1700,242 @@ static void ParquetGetSegFileDataLocation(Relation 
relation,
return;
 }
 
+static void InvokeHDFSProtocolBlockLocation(OidprocOid,
+List  *locs,
+List **blockLocations)
+{
+   ExtProtocolValidatorData   *validator_data;
+   FmgrInfo   *validator_udf;
+   FunctionCallInfoDatafcinfo;
+
+   validator_data = (ExtProtocolValidatorData *)
+palloc0 
(sizeof(ExtProtocolValidatorData));
+   validator_udf = palloc(sizeof(FmgrInfo));
+   fmgr_info(procOid, validator_udf);
+
+   validator_data->type= T_ExtProtocolValidatorData;
+   validator_data->url_list= locs;
+   validator_data->format_opts = NULL;
+   validator_data->errmsg  = NULL;
+   validator_data->direction   = EXT_VALIDATE_READ;
+   validator_data->action  = EXT_VALID_ACT_GETBLKLOC;
+
+   InitFunctionCallInfoData(/* FunctionCallInfoData */ fcinfo,
+/* FmgrInfo */ 
validator_udf,
+/* nArgs */ 0,
+/* Call Context */ 
(Node *) validator_data,
+/* ResultSetInfo */ 
NULL);
+
+   /* invoke validator. if this function returns - validation passed */
+   FunctionCallInvoke();
+
+   ExtProtocolBlockLocationData *bls =
+   (ExtProtocolBlockLocationData *)(fcinfo.resultinfo);
+   /* debug output block location. */
+   if (bls != NULL)
+   {
+   ListCell *c;
+   foreach(c, bls->files)
+   {
+   blocklocation_file *blf = (blocklocation_file 
*)(lfirst(c));
+   elog(DEBUG3, "DEBUG LOCATION for %s with %d blocks",
+blf->file_uri, blf->block_num);
+   for ( int i = 0 ; i < blf->block_num ; ++i )
+   {
+   BlockLocation *pbl = &(blf->locations[i]);
+   elog(DEBUG3, "DEBUG LOCATION for block %d : %d, 
"
+INT64_FORMAT ", " 
INT64_FORMAT ", %d",
+i,
+pbl->corrupt, pbl->length, 
pbl->offset,
+pbl->numOfNodes);
+   for ( int j = 0 ; j < pbl->numOfNodes ; ++j )
+   {
+   elog(DEBUG3, "DEBUG LOCATION for block 
%d : %s, %s, %s",
+i,
+pbl->hosts[j], 
pbl->names[j],
+
pbl->topologyPaths[j]);
+   }
+   }
+   }
+   }
 
+   elog(DEBUG3, "after invoking get block location API");
+
+   /* get location data from fcinfo.resultinfo. */
+   if (bls != NULL)
+   {
+   Assert(bls->type == T_ExtProtocolBlockLocationData);
+   while(list_length(bls->files) > 0)
+   {
+   void *v = lfirst(list_head(bls->files));
+   bls->files = list_delete_first(bls->files);
+   *blockLocations = lappend(*blockLocations, v);
+   }
+   }
+   pfree(validator_data);
+   pfree(validator_udf);
+}
+
+Oid
+LookupCustomProtocolBlockLocationFunc(char *protoname)
+{
+   List*   funcname= NIL;
+   Oid procOid = InvalidOid;
+   Oid argList[1];
+   Oid returnOid;
+
+   char*   new_func_name = (char *)palloc0(strlen(protoname) + 16);
+   sprintf(new_func_name, "%s_blocklocation", protoname);
+   funcname = lappend(funcname, makeString(new_func_name));
+   returnOid = VOIDOID;
+   procOid = LookupFuncName(funcname, 0, argList, true);
+
+   if (!OidIsValid(procOid))
+   ereport(ERROR, (errcode(ERRCODE_UNDEFINED_FUNCTION),
+   errmsg("protocol function %s 
was not found.",
+   new_func_name),
+   errhint("Create it with CREATE 
FUNCTION."),
+   

[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208434815
  
--- Diff: src/backend/cdb/cdbdatalocality.c ---
@@ -1579,7 +1700,242 @@ static void ParquetGetSegFileDataLocation(Relation 
relation,
return;
 }
 
+static void InvokeHDFSProtocolBlockLocation(OidprocOid,
+List  *locs,
+List **blockLocations)
+{
+   ExtProtocolValidatorData   *validator_data;
+   FmgrInfo   *validator_udf;
+   FunctionCallInfoDatafcinfo;
+
+   validator_data = (ExtProtocolValidatorData *)
+palloc0 
(sizeof(ExtProtocolValidatorData));
+   validator_udf = palloc(sizeof(FmgrInfo));
+   fmgr_info(procOid, validator_udf);
+
+   validator_data->type= T_ExtProtocolValidatorData;
+   validator_data->url_list= locs;
+   validator_data->format_opts = NULL;
+   validator_data->errmsg  = NULL;
+   validator_data->direction   = EXT_VALIDATE_READ;
+   validator_data->action  = EXT_VALID_ACT_GETBLKLOC;
+
+   InitFunctionCallInfoData(/* FunctionCallInfoData */ fcinfo,
+/* FmgrInfo */ 
validator_udf,
+/* nArgs */ 0,
+/* Call Context */ 
(Node *) validator_data,
+/* ResultSetInfo */ 
NULL);
+
+   /* invoke validator. if this function returns - validation passed */
+   FunctionCallInvoke();
+
+   ExtProtocolBlockLocationData *bls =
+   (ExtProtocolBlockLocationData *)(fcinfo.resultinfo);
+   /* debug output block location. */
+   if (bls != NULL)
+   {
+   ListCell *c;
+   foreach(c, bls->files)
+   {
+   blocklocation_file *blf = (blocklocation_file 
*)(lfirst(c));
+   elog(DEBUG3, "DEBUG LOCATION for %s with %d blocks",
+blf->file_uri, blf->block_num);
+   for ( int i = 0 ; i < blf->block_num ; ++i )
+   {
+   BlockLocation *pbl = &(blf->locations[i]);
+   elog(DEBUG3, "DEBUG LOCATION for block %d : %d, 
"
+INT64_FORMAT ", " 
INT64_FORMAT ", %d",
+i,
+pbl->corrupt, pbl->length, 
pbl->offset,
+pbl->numOfNodes);
+   for ( int j = 0 ; j < pbl->numOfNodes ; ++j )
+   {
+   elog(DEBUG3, "DEBUG LOCATION for block 
%d : %s, %s, %s",
+i,
+pbl->hosts[j], 
pbl->names[j],
+
pbl->topologyPaths[j]);
+   }
+   }
+   }
+   }
 
+   elog(DEBUG3, "after invoking get block location API");
+
+   /* get location data from fcinfo.resultinfo. */
+   if (bls != NULL)
+   {
+   Assert(bls->type == T_ExtProtocolBlockLocationData);
+   while(list_length(bls->files) > 0)
+   {
+   void *v = lfirst(list_head(bls->files));
+   bls->files = list_delete_first(bls->files);
+   *blockLocations = lappend(*blockLocations, v);
+   }
+   }
+   pfree(validator_data);
+   pfree(validator_udf);
+}
+
+Oid
+LookupCustomProtocolBlockLocationFunc(char *protoname)
+{
+   List*   funcname= NIL;
+   Oid procOid = InvalidOid;
+   Oid argList[1];
+   Oid returnOid;
+
+   char*   new_func_name = (char *)palloc0(strlen(protoname) + 16);
+   sprintf(new_func_name, "%s_blocklocation", protoname);
+   funcname = lappend(funcname, makeString(new_func_name));
+   returnOid = VOIDOID;
+   procOid = LookupFuncName(funcname, 0, argList, true);
+
+   if (!OidIsValid(procOid))
+   ereport(ERROR, (errcode(ERRCODE_UNDEFINED_FUNCTION),
+   errmsg("protocol function %s 
was not found.",
+   new_func_name),
+   errhint("Create it with CREATE 
FUNCTION."),
+   

[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208434723
  
--- Diff: src/backend/cdb/cdbdatalocality.c ---
@@ -1579,7 +1700,242 @@ static void ParquetGetSegFileDataLocation(Relation 
relation,
return;
 }
 
+static void InvokeHDFSProtocolBlockLocation(OidprocOid,
+List  *locs,
+List **blockLocations)
+{
+   ExtProtocolValidatorData   *validator_data;
+   FmgrInfo   *validator_udf;
+   FunctionCallInfoDatafcinfo;
+
+   validator_data = (ExtProtocolValidatorData *)
+palloc0 
(sizeof(ExtProtocolValidatorData));
--- End diff --

Indent need to be fixed


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208434493
  
--- Diff: src/backend/access/external/fileam.c ---
@@ -86,14 +86,17 @@ static void InitParseState(CopyState pstate, Relation 
relation,
   char *uri, int rejectlimit,
   bool islimitinrows, Oid 
fmterrtbl, ResultRelSegFileInfo *segfileinfo, int encoding);
 
-static void FunctionCallPrepareFormatter(FunctionCallInfoData* fcinfo,
-   
 intnArgs,
-   
 CopyState  pstate,
-   
 FormatterData* formatter,
-   
 Relation   rel,
-   
 TupleDesc  tupDesc,
-   
 FmgrInfo  *convFuncs,
-   
 Oid   *typioparams);
+static void
+FunctionCallPrepareFormatter(FunctionCallInfoData* fcinfo,
+int
nArgs,
+CopyState  
pstate,
+FormatterData  
   *formatter,
+Relation   
rel,
+TupleDesc  
tupDesc,
+FmgrInfo   
   *convFuncs,
+Oid   
*typioparams,
--- End diff --

indent is not consistent


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208434427
  
--- Diff: src/backend/cdb/cdbdatalocality.c ---
@@ -844,36 +918,17 @@ int64 
get_block_locations_and_claculte_table_size(split_to_segment_mapping_conte
/*
 * We only consider the data stored in HDFS.
 */
-   if (RelationIsAoRows(rel) || RelationIsParquet(rel)) {
-   Relation_Data *rel_data = NULL;
-   /*
-* Get pg_appendonly information for this table.
-*/
-   AppendOnlyEntry *aoEntry = GetAppendOnlyEntry(rel_oid, 
SnapshotNow);
-
-   rel_data = (Relation_Data *) 
palloc(sizeof(Relation_Data));
+   bool isDataStoredInHdfs = dataStoredInHdfs(rel);
+   if (isDataStoredInHdfs ) {
+   GpPolicy *targetPolicy = 
GpPolicyFetch(CurrentMemoryContext, rel_oid);
+   Relation_Data *rel_data = (Relation_Data *) 
palloc(sizeof(Relation_Data));
rel_data->relid = rel_oid;
rel_data->files = NIL;
rel_data->partition_parent_relid = 0;
rel_data->block_count = 0;
-
-   GpPolicy *targetPolicy = NULL;
-   targetPolicy = GpPolicyFetch(CurrentMemoryContext, 
rel_oid);
-   /*
-* Based on the pg_appendonly information, calculate 
the data
-* location information associated with this relation.
-*/
-   if (RelationIsAoRows(rel)) {
-   rel_data->type = DATALOCALITY_APPENDONLY;
-   AOGetSegFileDataLocation(rel, aoEntry, 
ActiveSnapshot, context,
-   aoEntry->splitsize, rel_data, 
,
-   , targetPolicy);
-   } else {
-   rel_data->type = DATALOCALITY_PARQUET;
-   ParquetGetSegFileDataLocation(rel, aoEntry, 
ActiveSnapshot, context,
-   context->split_size, rel_data, 
,
-   , targetPolicy);
-   }
+   rel_data->port = 0;
+   rel_data->hivepath = NULL;
--- End diff --

No "rel_data->hivepath = NULL;"


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread Librago
Github user Librago commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208434274
  
--- Diff: contrib/exthdfs/exthdfs.c ---
@@ -0,0 +1,472 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+
+#include "postgres.h"
+
+#include "common.h"
+#include "access/extprotocol.h"
+#include "cdb/cdbdatalocality.h"
+#include "storage/fd.h"
+#include "storage/filesystem.h"
+#include "utils/uri.h"
+
+
+
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(hdfsprotocol_blocklocation);
+PG_FUNCTION_INFO_V1(hdfsprotocol_validate);
+
+Datum hdfsprotocol_blocklocation(PG_FUNCTION_ARGS);
+Datum hdfsprotocol_validate(PG_FUNCTION_ARGS);
+
+Datum hdfsprotocol_blocklocation(PG_FUNCTION_ARGS)
+{
+
+   // Build the result instance
+   int nsize = 0;
+   int numOfBlock = 0;
+   ExtProtocolBlockLocationData *bldata =
+   palloc0(sizeof(ExtProtocolBlockLocationData));
+   if (bldata == NULL)
+   {
+   elog(ERROR, "hdfsprotocol_blocklocation : "
+"cannot allocate due to no memory");
+   }
+   bldata->type = T_ExtProtocolBlockLocationData;
+   fcinfo->resultinfo = bldata;
+
+   ExtProtocolValidatorData *pvalidator_data = (ExtProtocolValidatorData *)
+   
(fcinfo->context);
+
+
+// Parse URI of the first location, we expect all locations uses the 
same
+// name node server. This is checked in validation function.
+
+   char *first_uri_str =
+   (char *)strVal(lfirst(list_head(pvalidator_data->url_list)));
+   Uri *uri = ParseExternalTableUri(first_uri_str);
+
+   elog(DEBUG3, "hdfsprotocol_blocklocation : "
+"extracted HDFS name node address %s:%d",
+uri->hostname, uri->port);
+
+   // Create file system instance
+   hdfsFS fs = hdfsConnect(uri->hostname, uri->port);
+   if (fs == NULL)
+   {
+   elog(ERROR, "hdfsprotocol_blocklocation : "
+   "failed to create HDFS instance to 
connect to %s:%d",
+   uri->hostname, uri->port);
+   }
+
+   // Clean up uri instance as we don't need it any longer
+   pfree(uri);
+
+   // Check all locations to get files to fetch location.
+   ListCell *lc = NULL;
+   foreach(lc, pvalidator_data->url_list)
+   {
+   // Parse current location URI.
+   char *url = (char *)strVal(lfirst(lc));
+   Uri *uri = ParseExternalTableUri(url);
+   if (uri == NULL)
+   {
+   elog(ERROR, "hdfsprotocol_blocklocation : "
+   "invalid URI encountered %s", 
url);
+   }
+
+//
+// NOTICE: We temporarily support only directories as 
locations. We plan
+//to extend the logic to specifying single file as one 
location
+// very soon.
+
+
+   // get files contained in the path.
+   hdfsFileInfo *fiarray = hdfsListDirectory(fs, uri->path,);
+   if (fiarray == NULL)
+   {
+   elog(ERROR, "hdfsprotocol_blocklocation : "
+   "failed to get files of path 
%s",
+   uri->path);
+   }
+
+   int i = 0 ;
+   // Call block location api to get data location for each file
+   for (i = 0 ; i < nsize ; i++)
+   {
+   hdfsFileInfo *fi = [i];
+
+   // break condition of this for loop
+   if (fi == NULL) {break;}
+
+   // Build file name full path.
+   const char *fname = 

[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208434291
  
--- Diff: src/backend/cdb/cdbdatalocality.c ---
@@ -193,6 +212,9 @@ typedef struct Relation_Data {
List *files;
Oid partition_parent_relid;
int64 block_count;
+   int port;
+   char *hostname;
+   char *hivepath;
--- End diff --

No "char *hivepath"


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208434247
  
--- Diff: src/backend/cdb/cdbdatalocality.c ---
@@ -123,10 +132,20 @@ typedef struct File_Split {
int64 logiceof;
int host;
bool is_local_read;
+   char *ext_file_uri;
+   char *lower_bound_inc;
+   char *upper_bound_exc;
+   int fs_lb_len;
+   int fs_ub_len;
 } File_Split;
 
 typedef enum DATALOCALITY_RELATION_TYPE {
-   DATALOCALITY_APPENDONLY, DATALOCALITY_PARQUET, DATALOCALITY_UNKNOWN
+   DATALOCALITY_APPENDONLY,
+   DATALOCALITY_PARQUET,
+   DATALOCALITY_HDFS,
+   DATALOCALITY_HIVE,
--- End diff --

No below relation type:
```
DATALOCALITY_HIVE,
DATALOCALITY_MAGMA,
```


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208434124
  
--- Diff: src/backend/cdb/cdbdatalocality.c ---
@@ -123,10 +132,20 @@ typedef struct File_Split {
int64 logiceof;
int host;
bool is_local_read;
+   char *ext_file_uri;
--- End diff --

Remove below element:
```
char *lower_bound_inc;
char *upper_bound_exc;
int fs_lb_len;
int fs_ub_len;
```


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208433943
  
--- Diff: src/backend/catalog/heap.c ---
@@ -2543,8 +2551,55 @@ heap_drop_with_catalog(Oid relid)
/*
 * External table? If so, delete the pg_exttable tuple.
 */
+/* if (is_external_rel)
--- End diff --

Remove the line commented here


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208433876
  
--- Diff: src/backend/catalog/cdb_external_extensions.sql ---
@@ -47,3 +47,38 @@ LANGUAGE C STABLE;
 CREATE OR REPLACE FUNCTION fixedwidth_out(record) RETURNS bytea 
 AS '$libdir/fixedwidth.so', 'fixedwidth_out'
 LANGUAGE C STABLE;
+
+--
+-- external HDFS
+--
+CREATE OR REPLACE FUNCTION hdfs_validate() RETURNS void
+AS '$libdir/exthdfs.so', 'hdfsprotocol_validate'
+LANGUAGE C STABLE;
+
+CREATE OR REPLACE FUNCTION hdfs_blocklocation() RETURNS void
+AS '$libdir/exthdfs.so', 'hdfsprotocol_blocklocation'
+LANGUAGE C STABLE;
+
+--
--- End diff --

No TEXT/CSV format code here


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208433762
  
--- Diff: contrib/exthdfs/exthdfs.c ---
@@ -0,0 +1,472 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+
+#include "postgres.h"
+
+#include "common.h"
+#include "access/extprotocol.h"
+#include "cdb/cdbdatalocality.h"
+#include "storage/fd.h"
+#include "storage/filesystem.h"
+#include "utils/uri.h"
+
+
+
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(hdfsprotocol_blocklocation);
+PG_FUNCTION_INFO_V1(hdfsprotocol_validate);
+
+Datum hdfsprotocol_blocklocation(PG_FUNCTION_ARGS);
+Datum hdfsprotocol_validate(PG_FUNCTION_ARGS);
+
+Datum hdfsprotocol_blocklocation(PG_FUNCTION_ARGS)
+{
+
+   // Build the result instance
+   int nsize = 0;
+   int numOfBlock = 0;
+   ExtProtocolBlockLocationData *bldata =
+   palloc0(sizeof(ExtProtocolBlockLocationData));
+   if (bldata == NULL)
+   {
+   elog(ERROR, "hdfsprotocol_blocklocation : "
+"cannot allocate due to no memory");
+   }
+   bldata->type = T_ExtProtocolBlockLocationData;
+   fcinfo->resultinfo = bldata;
+
+   ExtProtocolValidatorData *pvalidator_data = (ExtProtocolValidatorData *)
+   
(fcinfo->context);
+
+
+// Parse URI of the first location, we expect all locations uses the 
same
+// name node server. This is checked in validation function.
+
+   char *first_uri_str =
+   (char *)strVal(lfirst(list_head(pvalidator_data->url_list)));
+   Uri *uri = ParseExternalTableUri(first_uri_str);
+
+   elog(DEBUG3, "hdfsprotocol_blocklocation : "
+"extracted HDFS name node address %s:%d",
--- End diff --

Indent should follow the convention as previous code


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208433663
  
--- Diff: contrib/exthdfs/exthdfs.c ---
@@ -0,0 +1,472 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
--- End diff --

Remove redundant blank line here


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208433613
  
--- Diff: contrib/exthdfs/common.h ---
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+#ifndef _EXTHDFS_COMMON_H_
+#define _EXTHDFS_COMMON_H_
+
+#include "postgres.h"
+#include "fmgr.h"
+#include "funcapi.h"
+#include "access/extprotocol.h"
+#include "access/fileam.h"
+#include "catalog/pg_proc.h"
+#include "catalog/pg_exttable.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+#include "miscadmin.h"
+
--- End diff --

Remove redundant blank line here


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208433457
  
--- Diff: contrib/extfmtcsv/Makefile ---
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
--- End diff --

Remove extfmtcsv related code if the current commit is for hdfs protocol.


---


[GitHub] incubator-hawq pull request #1384: HAWQ-1628. Add HDFS protocol for external...

2018-08-07 Thread huor
Github user huor commented on a diff in the pull request:

https://github.com/apache/incubator-hawq/pull/1384#discussion_r208433287
  
--- Diff: contrib/Makefile ---
@@ -9,7 +9,9 @@ WANTED_DIRS = \
extprotocol \
gp_cancel_query \
formatter_fixedwidth \
-   hawq-hadoop
+   exthdfs\
--- End diff --

Use
```
exthdfs \
hawq-hadoop
```
Instead of
```
exthdfs\
extfmtcsv\
hawq-hadoop\
```


---