postgrespro
diff --git a/doc/src/sgml/fdwhandler.sgml b/doc/src/sgml/fdwhandler.sgml
Lines changed: 214 additions & 18 deletions
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
Lines changed: 61 additions & 18 deletions
@@ -665,6 +665,108 @@ IsForeignRelUpdatable (Relation rel);
 
    </sect2>
 
+   <sect2 id="fdw-callbacks-row-locking">
+    <title>FDW Routines for Row Locking</title>
+
+    <para>
+     If an FDW wishes to support <firstterm>late row locking</> (as described
+     in <xref linkend="fdw-row-locking">), it must provide the following
+     callback functions:
+    </para>
+
+    <para>
+<programlisting>
+RowMarkType
+GetForeignRowMarkType (RangeTblEntry *rte,
+                       LockClauseStrength strength);
+</programlisting>
+
+     Report which row-marking option to use for a foreign table.
+     <literal>rte</> is the <structname>RangeTblEntry</> node for the table
+     and <literal>strength</> describes the lock strength requested by the
+     relevant <literal>FOR UPDATE/SHARE</> clause, if any.  The result must be
+     a member of the <literal>RowMarkType</> enum type.
+    </para>
+
+    <para>
+     This function is called during query planning for each foreign table that
+     appears in an <command>UPDATE</>, <command>DELETE</>, or <command>SELECT
+     FOR UPDATE/SHARE</> query and is not the target of <command>UPDATE</>
+     or <command>DELETE</>.
+    </para>
+
+    <para>
+     If the <function>GetForeignRowMarkType</> pointer is set to
+     <literal>NULL</>, the <literal>ROW_MARK_COPY</> option is always used.
+     (This implies that <function>RefetchForeignRow</> will never be called,
+     so it need not be provided either.)
+    </para>
+
+    <para>
+     See <xref linkend="fdw-row-locking"> for more information.
+    </para>
+
+    <para>
+<programlisting>
+HeapTuple
+RefetchForeignRow (EState *estate,
+                   ExecRowMark *erm,
+                   Datum rowid,
+                   bool *updated);
+</programlisting>
+
+     Re-fetch one tuple from the foreign table, after locking it if required.
+     <literal>estate</> is global execution state for the query.
+     <literal>erm</> is the <structname>ExecRowMark</> struct describing
+     the target foreign table and the row lock type (if any) to acquire.
+     <literal>rowid</> identifies the tuple to be fetched.
+     <literal>updated</> is an output parameter.
+    </para>
+
+    <para>
+     This function should return a palloc'ed copy of the fetched tuple,
+     or <literal>NULL</> if the row lock couldn't be obtained.  The row lock
+     type to acquire is defined by <literal>erm-&gt;markType</>, which is the
+     value previously returned by <function>GetForeignRowMarkType</>.
+     (<literal>ROW_MARK_REFERENCE</> means to just re-fetch the tuple without
+     acquiring any lock, and <literal>ROW_MARK_COPY</> will never be seen by
+     this routine.)
+    </para>
+
+    <para>
+     In addition, <literal>*updated</> should be set to <literal>true</>
+     if what was fetched was an updated version of the tuple rather than
+     the same version previously obtained.  (If the FDW cannot be sure about
+     this, always setting <literal>*updated</> to <literal>true</> is
+     recommended.)
+    </para>
+
+    <para>
+     Note that by default, failure to acquire a row lock should result in
+     raising an error; a <literal>NULL</> return is only appropriate if
+     the <literal>SKIP LOCKED</> option is specified
+     by <literal>erm-&gt;waitPolicy</>.
+    </para>
+
+    <para>
+     The <literal>rowid</> is the <structfield>ctid</> value previously read
+     for the row to be re-fetched.  Although the <literal>rowid</> value is
+     passed as a <type>Datum</>, it can currently only be a <type>tid</>.  The
+     function API is chosen in hopes that it may be possible to allow other
+     data types for row IDs in the future.
+    </para>
+
+    <para>
+     If the <function>RefetchForeignRow</> pointer is set to
+     <literal>NULL</>, attempts to re-fetch rows will fail
+     with an error message.
+    </para>
+
+    <para>
+     See <xref linkend="fdw-row-locking"> for more information.
+    </para>
+
+   </sect2>
+
    <sect2 id="fdw-callbacks-explain">
     <title>FDW Routines for <command>EXPLAIN</></title>
 
@@ -1092,31 +1194,125 @@ GetForeignServerByName(const char *name, bool missing_ok);
      structures that <function>copyObject</> knows how to copy.
     </para>
 
-    <para>
-     For an <command>UPDATE</> or <command>DELETE</> against an external data
-     source that supports concurrent updates, it is recommended that the
-     <literal>ForeignScan</> operation lock the rows that it fetches, perhaps
-     via the equivalent of <command>SELECT FOR UPDATE</>.  The FDW may also
-     choose to lock rows at fetch time when the foreign table is referenced
-     in a <command>SELECT FOR UPDATE/SHARE</>; if it does not, the
-     <literal>FOR UPDATE</> or <literal>FOR SHARE</> option is essentially a
-     no-op so far as the foreign table is concerned.  This behavior may yield
-     semantics slightly different from operations on local tables, where row
-     locking is customarily delayed as long as possible: remote rows may get
-     locked even though they subsequently fail locally-applied restriction or
-     join conditions.  However, matching the local semantics exactly would
-     require an additional remote access for every row, and might be
-     impossible anyway depending on what locking semantics the external data
-     source provides.
-    </para>
-
     <para>
      <command>INSERT</> with an <literal>ON CONFLICT</> clause does not
      support specifying the conflict target, as remote constraints are not
      locally known. This in turn implies that <literal>ON CONFLICT DO
      UPDATE</> is not supported, since the specification is mandatory there.
     </para>
 
+   </sect1>
+
+   <sect1 id="fdw-row-locking">
+    <title>Row Locking in Foreign Data Wrappers</title>
+
+    <para>
+     If an FDW's underlying storage mechanism has a concept of locking
+     individual rows to prevent concurrent updates of those rows, it is
+     usually worthwhile for the FDW to perform row-level locking with as
+     close an approximation as practical to the semantics used in
+     ordinary <productname>PostgreSQL</> tables.  There are multiple
+     considerations involved in this.
+    </para>
+
+    <para>
+     One key decision to be made is whether to perform <firstterm>early
+     locking</> or <firstterm>late locking</>.  In early locking, a row is
+     locked when it is first retrieved from the underlying store, while in
+     late locking, the row is locked only when it is known that it needs to
+     be locked.  (The difference arises because some rows may be discarded by
+     locally-checked restriction or join conditions.)  Early locking is much
+     simpler and avoids extra round trips to a remote store, but it can cause
+     locking of rows that need not have been locked, resulting in reduced
+     concurrency or even unexpected deadlocks.  Also, late locking is only
+     possible if the row to be locked can be uniquely re-identified later.
+     Preferably the row identifier should identify a specific version of the
+     row, as <productname>PostgreSQL</> TIDs do.
+    </para>
+
+    <para>
+     By default, <productname>PostgreSQL</> ignores locking considerations
+     when interfacing to FDWs, but an FDW can perform early locking without
+     any explicit support from the core code.  The API functions described
+     in <xref linkend="fdw-callbacks-row-locking">, which were added
+     in <productname>PostgreSQL</> 9.5, allow an FDW to use late locking if
+     it wishes.
+    </para>
+
+    <para>
+     An additional consideration is that in <literal>READ COMMITTED</>
+     isolation mode, <productname>PostgreSQL</> may need to re-check
+     restriction and join conditions against an updated version of some
+     target tuple.  Rechecking join conditions requires re-obtaining copies
+     of the non-target rows that were previously joined to the target tuple.
+     When working with standard <productname>PostgreSQL</> tables, this is
+     done by including the TIDs of the non-target tables in the column list
+     projected through the join, and then re-fetching non-target rows when
+     required.  This approach keeps the join data set compact, but it
+     requires inexpensive re-fetch capability, as well as a TID that can
+     uniquely identify the row version to be re-fetched.  By default,
+     therefore, the approach used with foreign tables is to include a copy of
+     the entire row fetched from a foreign table in the column list projected
+     through the join.  This puts no special demands on the FDW but can
+     result in reduced performance of merge and hash joins.  An FDW that is
+     capable of meeting the re-fetch requirements can instead choose the
+     first approach, projecting only row identifiers through the join and
+     re-fetching rows when required.
+    </para>
+
+    <para>
+     For an <command>UPDATE</> or <command>DELETE</> on a foreign table, it
+     is recommended that the <literal>ForeignScan</> operation on the target
+     table perform early locking on the rows that it fetches, perhaps via the
+     equivalent of <command>SELECT FOR UPDATE</>.  An FDW can detect whether
+     a table is an <command>UPDATE</>/<command>DELETE</> target at plan time
+     by comparing its relid to <literal>root-&gt;parse-&gt;resultRelation</>,
+     or at execution time by using <function>ExecRelationIsTargetRelation()</>.
+     An alternative possibility is to perform late locking within the
+     <function>ExecForeignUpdate</> or <function>ExecForeignDelete</>
+     callback, but no special support is provided for this.
+    </para>
+
+    <para>
+     For foreign tables that are specified to be locked by a <command>SELECT
+     FOR UPDATE/SHARE</> command, the <literal>ForeignScan</> operation can
+     again perform early locking by fetching tuples with the equivalent
+     of <command>SELECT FOR UPDATE/SHARE</>.  To perform late locking
+     instead, provide the callback functions defined
+     in <xref linkend="fdw-callbacks-row-locking">.
+     In <function>GetForeignRowMarkType</>, select rowmark option
+     <literal>ROW_MARK_EXCLUSIVE</>, <literal>ROW_MARK_NOKEYEXCLUSIVE</>,
+     <literal>ROW_MARK_SHARE</>, or <literal>ROW_MARK_KEYSHARE</> depending
+     on the requested lock strength.  (The core code will act the same
+     regardless of which of these four options you choose.)
+     Elsewhere, you can detect whether a foreign table was specified to be
+     locked by this type of command by using <function>get_plan_rowmark</> at
+     plan time, or <function>ExecFindRowMark</> at execution time; you must
+     check not only whether a non-null rowmark struct is returned, but also
+     that its <structfield>strength</> field is not <literal>LCS_NONE</>.
+    </para>
+
+    <para>
+     Lastly, for foreign tables that are used in an <command>UPDATE</>,
+     <command>DELETE</> or <command>SELECT FOR UPDATE/SHARE</> command but
+     are not specified to be row-locked, you can override the default choice
+     to copy entire rows by having <function>GetForeignRowMarkType</> select
+     option <literal>ROW_MARK_REFERENCE</> when it sees lock strength
+     <literal>LCS_NONE</>.  This will cause <function>RefetchForeignRow</> to
+     be called with that value for <structfield>markType</>; it should then
+     re-fetch the row without acquiring any new lock.  (If you have
+     a <function>GetForeignRowMarkType</> function but don't wish to re-fetch
+     unlocked rows, select option <literal>ROW_MARK_COPY</>
+     for <literal>LCS_NONE</>.)
+    </para>
+
+    <para>
+     See <filename>src/include/nodes/lockoptions.h</>, the comments
+     for <type>RowMarkType</> and <type>PlanRowMark</>
+     in <filename>src/include/nodes/plannodes.h</>, and the comments for
+     <type>ExecRowMark</> in <filename>src/include/nodes/execnodes.h</> for
+     additional information.
+    </para>
+
   </sect1>
 
  </chapter>
@@ -898,8 +898,11 @@ InitPlan(QueryDesc *queryDesc, int eflags)
 		erm->prti = rc->prti;
 		erm->rowmarkId = rc->rowmarkId;
 		erm->markType = rc->markType;
+		erm->strength = rc->strength;
 		erm->waitPolicy = rc->waitPolicy;
+		erm->ermActive = false;
 		ItemPointerSetInvalid(&(erm->curCtid));
+		erm->ermExtra = NULL;
 		estate->es_rowMarks = lappend(estate->es_rowMarks, erm);
 	}
 
@@ -1143,6 +1146,8 @@ CheckValidResultRel(Relation resultRel, CmdType operation)
 static void
 CheckValidRowMarkRel(Relation rel, RowMarkType markType)
 {
+	FdwRoutine *fdwroutine;
+
 	switch (rel->rd_rel->relkind)
 	{
 		case RELKIND_RELATION:
@@ -1178,11 +1183,13 @@ CheckValidRowMarkRel(Relation rel, RowMarkType markType)
 							  RelationGetRelationName(rel))));
 			break;
 		case RELKIND_FOREIGN_TABLE:
-			/* Should not get here; planner should have used ROW_MARK_COPY */
-			ereport(ERROR,
-					(errcode(ERRCODE_WRONG_OBJECT_TYPE),
-					 errmsg("cannot lock rows in foreign table \"%s\"",
-							RelationGetRelationName(rel))));
+			/* Okay only if the FDW supports it */
+			fdwroutine = GetFdwRoutineForRelation(rel, false);
+			if (fdwroutine->RefetchForeignRow == NULL)
+				ereport(ERROR,
+						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+						 errmsg("cannot lock rows in foreign table \"%s\"",
+								RelationGetRelationName(rel))));
 			break;
 		default:
 			ereport(ERROR,
@@ -2005,9 +2012,11 @@ ExecUpdateLockMode(EState *estate, ResultRelInfo *relinfo)
 
 /*
  * ExecFindRowMark -- find the ExecRowMark struct for given rangetable index
+ *
+ * If no such struct, either return NULL or throw error depending on missing_ok
  */
 ExecRowMark *
-ExecFindRowMark(EState *estate, Index rti)
+ExecFindRowMark(EState *estate, Index rti, bool missing_ok)
 {
 	ListCell   *lc;
 
@@ -2018,8 +2027,9 @@ ExecFindRowMark(EState *estate, Index rti)
 		if (erm->rti == rti)
 			return erm;
 	}
-	elog(ERROR, "failed to find ExecRowMark for rangetable index %u", rti);
-	return NULL;				/* keep compiler quiet */
+	if (!missing_ok)
+		elog(ERROR, "failed to find ExecRowMark for rangetable index %u", rti);
+	return NULL;
 }
 
 /*
@@ -2530,7 +2540,7 @@ EvalPlanQualFetchRowMarks(EPQState *epqstate)
 
 		if (erm->markType == ROW_MARK_REFERENCE)
 		{
-			Buffer		buffer;
+			HeapTuple	copyTuple;
 
 			Assert(erm->relation != NULL);
 
@@ -2541,17 +2551,50 @@ EvalPlanQualFetchRowMarks(EPQState *epqstate)
 			/* non-locked rels could be on the inside of outer joins */
 			if (isNull)
 				continue;
-			tuple.t_self = *((ItemPointer) DatumGetPointer(datum));
 
-			/* okay, fetch the tuple */
-			if (!heap_fetch(erm->relation, SnapshotAny, &tuple, &buffer,
-							false, NULL))
-				elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck");
+			/* fetch requests on foreign tables must be passed to their FDW */
+			if (erm->relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
+			{
+				FdwRoutine *fdwroutine;
+				bool		updated = false;
 
-			/* successful, copy and store tuple */
-			EvalPlanQualSetTuple(epqstate, erm->rti,
-								 heap_copytuple(&tuple));
-			ReleaseBuffer(buffer);
+				fdwroutine = GetFdwRoutineForRelation(erm->relation, false);
+				/* this should have been checked already, but let's be safe */
+				if (fdwroutine->RefetchForeignRow == NULL)
+					ereport(ERROR,
+							(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+							 errmsg("cannot lock rows in foreign table \"%s\"",
+									RelationGetRelationName(erm->relation))));
+				copyTuple = fdwroutine->RefetchForeignRow(epqstate->estate,
+														  erm,
+														  datum,
+														  &updated);
+				if (copyTuple == NULL)
+					elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck");
+
+				/*
+				 * Ideally we'd insist on updated == false here, but that
+				 * assumes that FDWs can track that exactly, which they might
+				 * not be able to.  So just ignore the flag.
+				 */
+			}
+			else
+			{
+				/* ordinary table, fetch the tuple */
+				Buffer		buffer;
+
+				tuple.t_self = *((ItemPointer) DatumGetPointer(datum));
+				if (!heap_fetch(erm->relation, SnapshotAny, &tuple, &buffer,
+								false, NULL))
+					elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck");
+
+				/* successful, copy tuple */
+				copyTuple = heap_copytuple(&tuple);
+				ReleaseBuffer(buffer);
+			}
+
+			/* store tuple */
+			EvalPlanQualSetTuple(epqstate, erm->rti, copyTuple);
 		}
 		else
 		{