@@ -305,7 +305,10 @@ static void walkdir(const char *path,
305305#ifdef PG_FLUSH_DATA_WORKS
306306static void pre_sync_fname (const char * fname , bool isdir , int elevel );
307307#endif
308- static void fsync_fname_ext (const char * fname , bool isdir , int elevel );
308+ static void datadir_fsync_fname (const char * fname , bool isdir , int elevel );
309+
310+ static int fsync_fname_ext (const char * fname , bool isdir , bool ignore_perm , int elevel );
311+ static int fsync_parent_path (const char * fname , int elevel );
309312
310313
311314/*
@@ -412,54 +415,158 @@ pg_flush_data(int fd, off_t offset, off_t amount)
412415 * indicate the OS just doesn't allow/require fsyncing directories.
413416 */
414417void
415- fsync_fname (char * fname , bool isdir )
418+ fsync_fname (const char * fname , bool isdir )
419+ {
420+ fsync_fname_ext (fname , isdir , false, ERROR );
421+ }
422+
423+ /*
424+ * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
425+ *
426+ * This routine ensures that, after returning, the effect of renaming the file
427+ * persists in case of a crash. A crash while this routine is running will
428+ * leave you with either the pre-existing or the moved file in place of the
429+ * new file; no mixed state or truncated files are possible.
430+ *
431+ * It does so by using fsync on the old filename and the possibly existing
432+ * target filename before the rename, and the target file and directory after.
433+ *
434+ * Note that rename() cannot be used across arbitrary directories, as they
435+ * might not be on the same filesystem. Therefore this routine does not
436+ * support renaming across directories.
437+ *
438+ * Log errors with the caller specified severity.
439+ *
440+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
441+ * valid upon return.
442+ */
443+ int
444+ durable_rename (const char * oldfile , const char * newfile , int elevel )
416445{
417446 int fd ;
418- int returncode ;
419447
420448 /*
421- * Some OSs require directories to be opened read-only whereas other
422- * systems don't allow us to fsync files opened read-only; so we need both
423- * cases here
449+ * First fsync the old and target path (if it exists), to ensure that they
450+ * are properly persistent on disk. Syncing the target file is not
451+ * strictly necessary, but it makes it easier to reason about crashes;
452+ * because it's then guaranteed that either source or target file exists
453+ * after a crash.
424454 */
425- if (!isdir )
426- fd = OpenTransientFile (fname ,
427- O_RDWR | PG_BINARY ,
428- S_IRUSR | S_IWUSR );
455+ if (fsync_fname_ext (oldfile , false, false, elevel ) != 0 )
456+ return -1 ;
457+
458+ fd = OpenTransientFile ((char * ) newfile , PG_BINARY | O_RDWR , 0 );
459+ if (fd < 0 )
460+ {
461+ if (errno != ENOENT )
462+ {
463+ ereport (elevel ,
464+ (errcode_for_file_access (),
465+ errmsg ("could not open file \"%s\": %m" , newfile )));
466+ return -1 ;
467+ }
468+ }
429469 else
430- fd = OpenTransientFile (fname ,
431- O_RDONLY | PG_BINARY ,
432- S_IRUSR | S_IWUSR );
470+ {
471+ if (pg_fsync (fd ) != 0 )
472+ {
473+ int save_errno ;
474+
475+ /* close file upon error, might not be in transaction context */
476+ save_errno = errno ;
477+ CloseTransientFile (fd );
478+ errno = save_errno ;
479+
480+ ereport (elevel ,
481+ (errcode_for_file_access (),
482+ errmsg ("could not fsync file \"%s\": %m" , newfile )));
483+ return -1 ;
484+ }
485+ CloseTransientFile (fd );
486+ }
487+
488+ /* Time to do the real deal... */
489+ if (rename (oldfile , newfile ) < 0 )
490+ {
491+ ereport (elevel ,
492+ (errcode_for_file_access (),
493+ errmsg ("could not rename file \"%s\" to \"%s\": %m" ,
494+ oldfile , newfile )));
495+ return -1 ;
496+ }
433497
434498 /*
435- * Some OSs don't allow us to open directories at all (Windows returns
436- * EACCES)
499+ * To guarantee renaming the file is persistent, fsync the file with its
500+ * new name, and its containing directory.
437501 */
438- if (fd < 0 && isdir && ( errno == EISDIR || errno == EACCES ) )
439- return ;
502+ if (fsync_fname_ext ( newfile , false, false, elevel ) != 0 )
503+ return -1 ;
440504
441- else if (fd < 0 )
442- ereport (ERROR ,
443- (errcode_for_file_access (),
444- errmsg ("could not open file \"%s\": %m" , fname )));
505+ if (fsync_parent_path (newfile , elevel ) != 0 )
506+ return -1 ;
445507
446- returncode = pg_fsync (fd );
508+ return 0 ;
509+ }
510+
511+ /*
512+ * durable_link_or_rename -- rename a file in a durable manner.
513+ *
514+ * Similar to durable_rename(), except that this routine tries (but does not
515+ * guarantee) not to overwrite the target file.
516+ *
517+ * Note that a crash at an unfortunate moment can leave you with two links to
518+ * the target file.
519+ *
520+ * Log errors with the caller specified severity.
521+ *
522+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
523+ * valid upon return.
524+ */
525+ int
526+ durable_link_or_rename (const char * oldfile , const char * newfile , int elevel )
527+ {
528+ /*
529+ * Ensure that, if we crash directly after the rename/link, a file with
530+ * valid contents is moved into place.
531+ */
532+ if (fsync_fname_ext (oldfile , false, false, elevel ) != 0 )
533+ return -1 ;
447534
448- /* Some OSs don't allow us to fsync directories at all */
449- if (returncode != 0 && isdir && errno == EBADF )
535+ #if HAVE_WORKING_LINK
536+ if (link ( oldfile , newfile ) < 0 )
450537 {
451- CloseTransientFile (fd );
452- return ;
538+ ereport (elevel ,
539+ (errcode_for_file_access (),
540+ errmsg ("could not link file \"%s\" to \"%s\": %m" ,
541+ oldfile , newfile )));
542+ return -1 ;
453543 }
454-
455- if (returncode != 0 )
456- ereport (ERROR ,
544+ unlink (oldfile );
545+ #else
546+ /* XXX: Add racy file existence check? */
547+ if (rename (oldfile , newfile ) < 0 )
548+ {
549+ ereport (elevel ,
457550 (errcode_for_file_access (),
458- errmsg ("could not fsync file \"%s\": %m" , fname )));
551+ errmsg ("could not rename file \"%s\" to \"%s\": %m" ,
552+ oldfile , newfile )));
553+ return -1 ;
554+ }
555+ #endif
459556
460- CloseTransientFile (fd );
461- }
557+ /*
558+ * Make the change persistent in case of an OS crash: both the new entry
559+ * and its parent directory need to be flushed.
560+ */
561+ if (fsync_fname_ext (newfile , false, false, elevel ) != 0 )
562+ return -1 ;
563+
564+ /* Same for parent directory */
565+ if (fsync_parent_path (newfile , elevel ) != 0 )
566+ return -1 ;
462567
568+ return 0 ;
569+ }
463570
464571/*
465572 * InitFileAccess --- initialize this module during backend startup
@@ -2546,10 +2653,10 @@ SyncDataDirectory(void)
25462653 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
25472654 * so we don't worry about optimizing it.
25482655 */
2549- walkdir ("." , fsync_fname_ext , false, LOG );
2656+ walkdir ("." , datadir_fsync_fname , false, LOG );
25502657 if (xlog_is_symlink )
2551- walkdir ("pg_xlog" , fsync_fname_ext , false, LOG );
2552- walkdir ("pg_tblspc" , fsync_fname_ext , true, LOG );
2658+ walkdir ("pg_xlog" , datadir_fsync_fname , false, LOG );
2659+ walkdir ("pg_tblspc" , datadir_fsync_fname , true, LOG );
25532660}
25542661
25552662/*
@@ -2663,15 +2770,26 @@ pre_sync_fname(const char *fname, bool isdir, int elevel)
26632770
26642771#endif /* PG_FLUSH_DATA_WORKS */
26652772
2773+ static void
2774+ datadir_fsync_fname (const char * fname , bool isdir , int elevel )
2775+ {
2776+ /*
2777+ * We want to silently ignore errors about unreadable files. Pass that
2778+ * desire on to fsync_fname_ext().
2779+ */
2780+ fsync_fname_ext (fname , isdir , true, elevel );
2781+ }
2782+
26662783/*
26672784 * fsync_fname_ext -- Try to fsync a file or directory
26682785 *
2669- * Ignores errors trying to open unreadable files, or trying to fsync
2670- * directories on systems where that isn't allowed/required, and logs other
2671- * errors at a caller-specified level.
2786+ * If ignore_perm is true, ignore errors upon trying to open unreadable
2787+ * files. Logs other errors at a caller-specified level.
2788+ *
2789+ * Returns 0 if the operation succeeded, -1 otherwise.
26722790 */
2673- static void
2674- fsync_fname_ext (const char * fname , bool isdir , int elevel )
2791+ static int
2792+ fsync_fname_ext (const char * fname , bool isdir , bool ignore_perm , int elevel )
26752793{
26762794 int fd ;
26772795 int flags ;
@@ -2689,20 +2807,23 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
26892807 else
26902808 flags |= O_RDONLY ;
26912809
2810+ fd = OpenTransientFile ((char * ) fname , flags , 0 );
2811+
26922812 /*
2693- * Open the file, silently ignoring errors about unreadable files (or
2694- * unsupported operations, e.g. opening a directory under Windows), and
2695- * logging others.
2813+ * Some OSs don't allow us to open directories at all (Windows returns
2814+ * EACCES); just ignore the error in that case. If desired, also silently
2815+ * ignore errors about unreadable files. Log others.
26962816 */
2697- fd = OpenTransientFile ((char * ) fname , flags , 0 );
2698- if (fd < 0 )
2817+ if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES ))
2818+ return 0 ;
2819+ else if (fd < 0 && ignore_perm && errno == EACCES )
2820+ return 0 ;
2821+ else if (fd < 0 )
26992822 {
2700- if (errno == EACCES || (isdir && errno == EISDIR ))
2701- return ;
27022823 ereport (elevel ,
27032824 (errcode_for_file_access (),
27042825 errmsg ("could not open file \"%s\": %m" , fname )));
2705- return ;
2826+ return -1 ;
27062827 }
27072828
27082829 returncode = pg_fsync (fd );
@@ -2712,9 +2833,49 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
27122833 * those errors. Anything else needs to be logged.
27132834 */
27142835 if (returncode != 0 && !(isdir && errno == EBADF ))
2836+ {
2837+ int save_errno ;
2838+
2839+ /* close file upon error, might not be in transaction context */
2840+ save_errno = errno ;
2841+ (void ) CloseTransientFile (fd );
2842+ errno = save_errno ;
2843+
27152844 ereport (elevel ,
27162845 (errcode_for_file_access (),
27172846 errmsg ("could not fsync file \"%s\": %m" , fname )));
2847+ return -1 ;
2848+ }
27182849
27192850 (void ) CloseTransientFile (fd );
2851+
2852+ return 0 ;
2853+ }
2854+
2855+ /*
2856+ * fsync_parent_path -- fsync the parent path of a file or directory
2857+ *
2858+ * This is aimed at making file operations persistent on disk in case of
2859+ * an OS crash or power failure.
2860+ */
2861+ static int
2862+ fsync_parent_path (const char * fname , int elevel )
2863+ {
2864+ char parentpath [MAXPGPATH ];
2865+
2866+ strlcpy (parentpath , fname , MAXPGPATH );
2867+ get_parent_directory (parentpath );
2868+
2869+ /*
2870+ * get_parent_directory() returns an empty string if the input argument is
2871+ * just a file name (see comments in path.c), so handle that as being the
2872+ * current directory.
2873+ */
2874+ if (strlen (parentpath ) == 0 )
2875+ strlcpy (parentpath , "." , MAXPGPATH );
2876+
2877+ if (fsync_fname_ext (parentpath , true, false, elevel ) != 0 )
2878+ return -1 ;
2879+
2880+ return 0 ;
27202881}
0 commit comments