@@ -1059,6 +1059,148 @@ ldelete:;
10591059 return NULL ;
10601060}
10611061
1062+ /*
1063+ * ExecCrossPartitionUpdate --- Move an updated tuple to another partition.
1064+ *
1065+ * This works by first deleting the old tuple from the current partition,
1066+ * followed by inserting the new tuple into the root parent table, that is,
1067+ * mtstate->rootResultRelInfo. It will be re-routed from there to the
1068+ * correct partition.
1069+ *
1070+ * Returns true if the tuple has been successfully moved, or if it's found
1071+ * that the tuple was concurrently deleted so there's nothing more to do
1072+ * for the caller.
1073+ *
1074+ * False is returned if the tuple we're trying to move is found to have been
1075+ * concurrently updated. In that case, the caller must to check if the
1076+ * updated tuple that's returned in *retry_slot still needs to be re-routed,
1077+ * and call this function again or perform a regular update accordingly.
1078+ */
1079+ static bool
1080+ ExecCrossPartitionUpdate (ModifyTableState * mtstate ,
1081+ ResultRelInfo * resultRelInfo ,
1082+ ItemPointer tupleid , HeapTuple oldtuple ,
1083+ TupleTableSlot * slot , TupleTableSlot * planSlot ,
1084+ EPQState * epqstate , bool canSetTag ,
1085+ TupleTableSlot * * retry_slot ,
1086+ TupleTableSlot * * inserted_tuple )
1087+ {
1088+ EState * estate = mtstate -> ps .state ;
1089+ PartitionTupleRouting * proute = mtstate -> mt_partition_tuple_routing ;
1090+ int map_index ;
1091+ TupleConversionMap * tupconv_map ;
1092+ TupleConversionMap * saved_tcs_map = NULL ;
1093+ bool tuple_deleted ;
1094+ TupleTableSlot * epqslot = NULL ;
1095+
1096+ * inserted_tuple = NULL ;
1097+ * retry_slot = NULL ;
1098+
1099+ /*
1100+ * Disallow an INSERT ON CONFLICT DO UPDATE that causes the original row
1101+ * to migrate to a different partition. Maybe this can be implemented
1102+ * some day, but it seems a fringe feature with little redeeming value.
1103+ */
1104+ if (((ModifyTable * ) mtstate -> ps .plan )-> onConflictAction == ONCONFLICT_UPDATE )
1105+ ereport (ERROR ,
1106+ (errcode (ERRCODE_FEATURE_NOT_SUPPORTED ),
1107+ errmsg ("invalid ON UPDATE specification" ),
1108+ errdetail ("The result tuple would appear in a different partition than the original tuple." )));
1109+
1110+ /*
1111+ * When an UPDATE is run on a leaf partition, we will not have partition
1112+ * tuple routing set up. In that case, fail with partition constraint
1113+ * violation error.
1114+ */
1115+ if (proute == NULL )
1116+ ExecPartitionCheckEmitError (resultRelInfo , slot , estate );
1117+
1118+ /*
1119+ * Row movement, part 1. Delete the tuple, but skip RETURNING processing.
1120+ * We want to return rows from INSERT.
1121+ */
1122+ ExecDelete (mtstate , resultRelInfo , tupleid , oldtuple , planSlot ,
1123+ epqstate , estate ,
1124+ false, /* processReturning */
1125+ false, /* canSetTag */
1126+ true, /* changingPart */
1127+ & tuple_deleted , & epqslot );
1128+
1129+ /*
1130+ * For some reason if DELETE didn't happen (e.g. trigger prevented it, or
1131+ * it was already deleted by self, or it was concurrently deleted by
1132+ * another transaction), then we should skip the insert as well;
1133+ * otherwise, an UPDATE could cause an increase in the total number of
1134+ * rows across all partitions, which is clearly wrong.
1135+ *
1136+ * For a normal UPDATE, the case where the tuple has been the subject of a
1137+ * concurrent UPDATE or DELETE would be handled by the EvalPlanQual
1138+ * machinery, but for an UPDATE that we've translated into a DELETE from
1139+ * this partition and an INSERT into some other partition, that's not
1140+ * available, because CTID chains can't span relation boundaries. We
1141+ * mimic the semantics to a limited extent by skipping the INSERT if the
1142+ * DELETE fails to find a tuple. This ensures that two concurrent
1143+ * attempts to UPDATE the same tuple at the same time can't turn one tuple
1144+ * into two, and that an UPDATE of a just-deleted tuple can't resurrect
1145+ * it.
1146+ */
1147+ if (!tuple_deleted )
1148+ {
1149+ /*
1150+ * epqslot will be typically NULL. But when ExecDelete() finds that
1151+ * another transaction has concurrently updated the same row, it
1152+ * re-fetches the row, skips the delete, and epqslot is set to the
1153+ * re-fetched tuple slot. In that case, we need to do all the checks
1154+ * again.
1155+ */
1156+ if (TupIsNull (epqslot ))
1157+ return true;
1158+ else
1159+ {
1160+ * retry_slot = ExecFilterJunk (resultRelInfo -> ri_junkFilter , epqslot );
1161+ return false;
1162+ }
1163+ }
1164+
1165+ /*
1166+ * resultRelInfo is one of the per-subplan resultRelInfos. So we should
1167+ * convert the tuple into root's tuple descriptor, since ExecInsert()
1168+ * starts the search from root. The tuple conversion map list is in the
1169+ * order of mtstate->resultRelInfo[], so to retrieve the one for this
1170+ * resultRel, we need to know the position of the resultRel in
1171+ * mtstate->resultRelInfo[].
1172+ */
1173+ map_index = resultRelInfo - mtstate -> resultRelInfo ;
1174+ Assert (map_index >= 0 && map_index < mtstate -> mt_nplans );
1175+ tupconv_map = tupconv_map_for_subplan (mtstate , map_index );
1176+ if (tupconv_map != NULL )
1177+ slot = execute_attr_map_slot (tupconv_map -> attrMap ,
1178+ slot ,
1179+ mtstate -> mt_root_tuple_slot );
1180+
1181+ /*
1182+ * ExecInsert() may scribble on mtstate->mt_transition_capture, so save
1183+ * the currently active map.
1184+ */
1185+ if (mtstate -> mt_transition_capture )
1186+ saved_tcs_map = mtstate -> mt_transition_capture -> tcs_map ;
1187+
1188+ /* Tuple routing starts from the root table. */
1189+ Assert (mtstate -> rootResultRelInfo != NULL );
1190+ * inserted_tuple = ExecInsert (mtstate , mtstate -> rootResultRelInfo , slot ,
1191+ planSlot , estate , canSetTag );
1192+
1193+ /* Clear the INSERT's tuple and restore the saved map. */
1194+ if (mtstate -> mt_transition_capture )
1195+ {
1196+ mtstate -> mt_transition_capture -> tcs_original_insert_tuple = NULL ;
1197+ mtstate -> mt_transition_capture -> tcs_map = saved_tcs_map ;
1198+ }
1199+
1200+ /* We're done moving. */
1201+ return true;
1202+ }
1203+
10621204/* ----------------------------------------------------------------
10631205 * ExecUpdate
10641206 *
@@ -1212,119 +1354,28 @@ lreplace:;
12121354 */
12131355 if (partition_constraint_failed )
12141356 {
1215- bool tuple_deleted ;
1216- TupleTableSlot * ret_slot ;
1217- TupleTableSlot * epqslot = NULL ;
1218- PartitionTupleRouting * proute = mtstate -> mt_partition_tuple_routing ;
1219- int map_index ;
1220- TupleConversionMap * tupconv_map ;
1221- TupleConversionMap * saved_tcs_map = NULL ;
1222-
1223- /*
1224- * Disallow an INSERT ON CONFLICT DO UPDATE that causes the
1225- * original row to migrate to a different partition. Maybe this
1226- * can be implemented some day, but it seems a fringe feature with
1227- * little redeeming value.
1228- */
1229- if (((ModifyTable * ) mtstate -> ps .plan )-> onConflictAction == ONCONFLICT_UPDATE )
1230- ereport (ERROR ,
1231- (errcode (ERRCODE_FEATURE_NOT_SUPPORTED ),
1232- errmsg ("invalid ON UPDATE specification" ),
1233- errdetail ("The result tuple would appear in a different partition than the original tuple." )));
1234-
1235- /*
1236- * When an UPDATE is run on a leaf partition, we will not have
1237- * partition tuple routing set up. In that case, fail with
1238- * partition constraint violation error.
1239- */
1240- if (proute == NULL )
1241- ExecPartitionCheckEmitError (resultRelInfo , slot , estate );
1242-
1243- /*
1244- * Row movement, part 1. Delete the tuple, but skip RETURNING
1245- * processing. We want to return rows from INSERT.
1246- */
1247- ExecDelete (mtstate , resultRelInfo , tupleid , oldtuple , planSlot ,
1248- epqstate , estate ,
1249- false, /* processReturning */
1250- false, /* canSetTag */
1251- true, /* changingPart */
1252- & tuple_deleted , & epqslot );
1253-
1254- /*
1255- * For some reason if DELETE didn't happen (e.g. trigger prevented
1256- * it, or it was already deleted by self, or it was concurrently
1257- * deleted by another transaction), then we should skip the insert
1258- * as well; otherwise, an UPDATE could cause an increase in the
1259- * total number of rows across all partitions, which is clearly
1260- * wrong.
1261- *
1262- * For a normal UPDATE, the case where the tuple has been the
1263- * subject of a concurrent UPDATE or DELETE would be handled by
1264- * the EvalPlanQual machinery, but for an UPDATE that we've
1265- * translated into a DELETE from this partition and an INSERT into
1266- * some other partition, that's not available, because CTID chains
1267- * can't span relation boundaries. We mimic the semantics to a
1268- * limited extent by skipping the INSERT if the DELETE fails to
1269- * find a tuple. This ensures that two concurrent attempts to
1270- * UPDATE the same tuple at the same time can't turn one tuple
1271- * into two, and that an UPDATE of a just-deleted tuple can't
1272- * resurrect it.
1273- */
1274- if (!tuple_deleted )
1275- {
1276- /*
1277- * epqslot will be typically NULL. But when ExecDelete()
1278- * finds that another transaction has concurrently updated the
1279- * same row, it re-fetches the row, skips the delete, and
1280- * epqslot is set to the re-fetched tuple slot. In that case,
1281- * we need to do all the checks again.
1282- */
1283- if (TupIsNull (epqslot ))
1284- return NULL ;
1285- else
1286- {
1287- slot = ExecFilterJunk (resultRelInfo -> ri_junkFilter , epqslot );
1288- goto lreplace ;
1289- }
1290- }
1357+ TupleTableSlot * inserted_tuple ,
1358+ * retry_slot ;
1359+ bool retry ;
12911360
12921361 /*
1293- * resultRelInfo is one of the per-subplan resultRelInfos. So we
1294- * should convert the tuple into root's tuple descriptor, since
1295- * ExecInsert() starts the search from root. The tuple conversion
1296- * map list is in the order of mtstate->resultRelInfo[], so to
1297- * retrieve the one for this resultRel, we need to know the
1298- * position of the resultRel in mtstate->resultRelInfo[].
1362+ * ExecCrossPartitionUpdate will first DELETE the row from the
1363+ * partition it's currently in and then insert it back into the
1364+ * root table, which will re-route it to the correct partition.
1365+ * The first part may have to be repeated if it is detected that
1366+ * the tuple we're trying to move has been concurrently updated.
12991367 */
1300- map_index = resultRelInfo - mtstate -> resultRelInfo ;
1301- Assert (map_index >= 0 && map_index < mtstate -> mt_nplans );
1302- tupconv_map = tupconv_map_for_subplan (mtstate , map_index );
1303- if (tupconv_map != NULL )
1304- slot = execute_attr_map_slot (tupconv_map -> attrMap ,
1305- slot ,
1306- mtstate -> mt_root_tuple_slot );
1307-
1308- /*
1309- * ExecInsert() may scribble on mtstate->mt_transition_capture, so
1310- * save the currently active map.
1311- */
1312- if (mtstate -> mt_transition_capture )
1313- saved_tcs_map = mtstate -> mt_transition_capture -> tcs_map ;
1314-
1315- /* Tuple routing starts from the root table. */
1316- Assert (mtstate -> rootResultRelInfo != NULL );
1317- ret_slot = ExecInsert (mtstate , mtstate -> rootResultRelInfo , slot ,
1318- planSlot , estate , canSetTag );
1319-
1320- /* Clear the INSERT's tuple and restore the saved map. */
1321- if (mtstate -> mt_transition_capture )
1368+ retry = !ExecCrossPartitionUpdate (mtstate , resultRelInfo , tupleid ,
1369+ oldtuple , slot , planSlot ,
1370+ epqstate , canSetTag ,
1371+ & retry_slot , & inserted_tuple );
1372+ if (retry )
13221373 {
1323- mtstate -> mt_transition_capture -> tcs_original_insert_tuple = NULL ;
1324- mtstate -> mt_transition_capture -> tcs_map = saved_tcs_map ;
1374+ slot = retry_slot ;
1375+ goto lreplace ;
13251376 }
13261377
1327- return ret_slot ;
1378+ return inserted_tuple ;
13281379 }
13291380
13301381 /*
0 commit comments