ITPub博客

首页 > 数据库 > PostgreSQL > PostgreSQL 源码解读(131)- MVCC#15(vacuum过程-lazy_vacuum_heap函数)

PostgreSQL 源码解读(131)- MVCC#15(vacuum过程-lazy_vacuum_heap函数)

原创 PostgreSQL 作者:husthxd 时间:2019-01-29 15:06:05 0 删除 编辑

本节简单介绍了PostgreSQL手工执行vacuum的处理流程,主要分析了ExecVacuum->vacuum->vacuum_rel->heap_vacuum_rel->lazy_scan_heap->lazy_vacuum_heap函数的实现逻辑,该函数访问堆表,标记废弃元组为未使用并在这些元组所在页面上压缩空闲空间。

一、数据结构

宏定义
Vacuum和Analyze命令选项


/*
 * VacuumOption -- option bits for the VACUUM and ANALYZE statements.
 *
 * VACUUM and ANALYZE are nominally two statements, but a single node type
 * serves both, so they share one option set.  At least one of
 * VACOPT_VACUUM and VACOPT_ANALYZE must be set in options.
 *
 * Every enumerator occupies a distinct bit, so options combine with
 * bitwise OR.
 */
typedef enum VacuumOption
{
    VACOPT_VACUUM = 0x01,                   /* bit 0: do VACUUM */
    VACOPT_ANALYZE = 0x02,                  /* bit 1: do ANALYZE */
    VACOPT_VERBOSE = 0x04,                  /* bit 2: print progress info */
    VACOPT_FREEZE = 0x08,                   /* bit 3: FREEZE option */
    VACOPT_FULL = 0x10,                     /* bit 4: FULL (non-concurrent) vacuum */
    VACOPT_SKIP_LOCKED = 0x20,              /* bit 5: skip if cannot get lock */
    VACOPT_SKIPTOAST = 0x40,                /* bit 6: don't process the TOAST table, if any */
    VACOPT_DISABLE_PAGE_SKIPPING = 0x80     /* bit 7: don't skip any pages */
} VacuumOption;

itemIdSort
PageRepairFragmentation/PageIndexMultiDelete的排序支持


/*
 * sorting support for PageRepairFragmentation and PageIndexMultiDelete
 *
 * One entry describes one line pointer that has storage: which slot of the
 * line pointer array it is, where its data currently sits in the page, and
 * how many (aligned) bytes that data occupies.
 */
typedef struct itemIdSortData
{
    /* zero-based slot in the line pointer array (offset number - 1) */
    uint16      offsetindex;    /* linp array index */
    /* byte offset of the item's data from the start of the page */
    int16       itemoff;        /* page offset of item data */
    /* item length rounded up with MAXALIGN */
    uint16      alignedlen;     /* MAXALIGN(item data len) */
} itemIdSortData;
/* pointer to an itemIdSortData entry */
typedef itemIdSortData *itemIdSort;

LVRelStats


/*
 * LVRelStats -- statistics and dead-tuple bookkeeping that the lazy vacuum
 * routines pass around (see lazy_vacuum_heap and lazy_vacuum_page).
 */
typedef struct LVRelStats
{
    /* hasindex = true means two-pass strategy; false means one-pass */
    bool        hasindex;
    /* Overall statistics about rel */
    BlockNumber old_rel_pages;  /* previous value of pg_class.relpages */
    BlockNumber rel_pages;      /* total number of pages */
    BlockNumber scanned_pages;  /* number of pages we examined */
    BlockNumber pinskipped_pages;   /* # of pages we skipped due to a pin */
    BlockNumber frozenskipped_pages;    /* # of frozen pages we skipped */
    BlockNumber tupcount_pages; /* pages whose tuples we counted */
    double      old_live_tuples;    /* previous value of pg_class.reltuples */
    double      new_rel_tuples; /* new estimated total # of tuples */
    double      new_live_tuples;    /* new estimated total # of live tuples */
    double      new_dead_tuples;    /* new estimated total # of dead tuples */
    /* number of pages removed */
    BlockNumber pages_removed;
    /* number of tuples deleted */
    double      tuples_deleted;
    BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
    /* List of TIDs of tuples we intend to delete */
    /* NB: this list is ordered by TID address */
    int         num_dead_tuples;    /* current # of entries */
    int         max_dead_tuples;    /* # slots allocated in array */
    ItemPointer dead_tuples;    /* array of ItemPointerData */
    /* presumably the number of index-vacuum passes done; not used in this excerpt */
    int         num_index_scans;
    /* passed to log_heap_clean() when a cleaned page is WAL-logged */
    TransactionId latestRemovedXid;
    /* presumably set when another backend is found waiting on a lock; not used in this excerpt */
    bool        lock_waiter_detected;
} LVRelStats;

ItemPointer
元组标识符(TID):由块号(ip_blkid)和块内行指针序号(ip_posid)组成;页面内的行指针本身是ItemId


/*
 * ItemPointerData -- identifies a tuple by the block it lives in and its
 * position within that block.
 *
 * ip_posid is an offset number, i.e. an index into the page's line pointer
 * array (it is what lazy_vacuum_page hands to PageGetItemId), not a byte
 * offset into the page.
 *
 * NOTE(review): the upstream definition in storage/itemptr.h also places
 * packing/alignment attributes between the closing brace and the typedef
 * name when the compiler supports them; they are omitted from this
 * excerpt -- confirm against the PostgreSQL version being read.
 */
typedef struct ItemPointerData
{
    BlockIdData ip_blkid;       /* block number */
    OffsetNumber ip_posid;      /* offset number (line pointer index) in the block */
} ItemPointerData;

/* pointer to an ItemPointerData */
typedef ItemPointerData *ItemPointer;

二、源码解读

lazy_vacuum_heap
lazy_vacuum_heap标记废弃元组为未使用并在这些元组所在页面上压缩空闲空间;lazy_scan_heap没有记录到废弃元组的页面则完全不会被访问.
主要处理流程如下:
1.初始化变量
2.遍历vacrelstats->dead_tuples数组(元素类型为ItemPointerData,共vacrelstats->num_dead_tuples个元素)
2.1获取块号/读取块到缓冲区中
2.2加锁,如不成功,则处理下一个元组
2.3调用lazy_vacuum_page释放空间,整理碎片
2.4获取page,获取该page的空闲空间
2.5释放缓冲,记录空闲空间
3.收尾工作


/*
 *  lazy_vacuum_heap() -- second pass over the heap
 *
 *      Walks vacrelstats->dead_tuples, marks the recorded dead tuples as
 *      unused and compacts the free space on the pages that hold them.
 *      Pages for which lazy_scan_heap recorded no dead tuples are not
 *      visited at all.
 *
 * Note: this is done as a second pass because the tuples cannot be removed
 * until their index entries are gone, and index entry removal should be
 * processed in batches as large as possible.
 *
 * onerel is the heap relation being vacuumed; vacrelstats supplies the
 * dead-tuple array and its length.  Nothing is returned; a summary message
 * is reported at the end.
 */
static void
lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
{
    PGRUsage    start_usage;    /* resource usage snapshot for the report */
    Buffer      vmbuffer = InvalidBuffer;   /* visibility map buffer, kept across pages */
    int         next = 0;       /* index of the next dead-tuple entry to handle */
    int         pages_done = 0; /* heap pages actually cleaned */

    pg_rusage_init(&start_usage);

    while (next < vacrelstats->num_dead_tuples)
    {
        BlockNumber blkno;
        Buffer      heapbuf;
        Size        avail;

        vacuum_delay_point();

        /* Pin the heap block that holds the current dead-tuple entry */
        blkno = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[next]);
        heapbuf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL,
                                     vac_strategy);

        if (ConditionalLockBufferForCleanup(heapbuf))
        {
            /*
             * Got the cleanup lock without waiting: clean the page.  The
             * callee returns the index of the first entry that is not on
             * this block, which is where the next iteration resumes.
             */
            next = lazy_vacuum_page(onerel, blkno, heapbuf, next, vacrelstats,
                                    &vmbuffer);

            /* Now that we've compacted the page, record its available space */
            avail = PageGetHeapFreeSpace(BufferGetPage(heapbuf));
            UnlockReleaseBuffer(heapbuf);
            RecordPageWithFreeSpace(onerel, blkno, avail);
            pages_done++;
        }
        else
        {
            /*
             * Could not get the cleanup lock: drop the pin and skip just
             * this one entry.  The following entry is examined on the next
             * iteration, even if it is on the same block.
             */
            ReleaseBuffer(heapbuf);
            next++;
        }
    }

    /* Let go of the visibility map page, if one is still pinned */
    if (BufferIsValid(vmbuffer))
    {
        ReleaseBuffer(vmbuffer);
        vmbuffer = InvalidBuffer;
    }

    /*
     * The row-version count reported is the final array index, so entries
     * skipped above for lack of a cleanup lock are included in it.
     */
    ereport(elevel,
            (errmsg("\"%s\": removed %d row versions in %d pages",
                    RelationGetRelationName(onerel),
                    next, pages_done),
             errdetail_internal("%s", pg_rusage_show(&start_usage))));
}

lazy_vacuum_page
lazy_vacuum_page释放page中的废弃元组,并整理碎片
主要处理逻辑如下:
1.初始化相关变量
2.遍历废弃元组数组
2.1获取块号,如块号不一致,跳出循环
2.2获取偏移/行指针
2.3标记为未使用,记录偏移
3.调用PageRepairFragmentation整理碎片
3.1判断和检查(严谨的编码!!!)
3.2获取偏移,初始化变量
3.3遍历行指针数组
3.3.1获取行指针lp
3.3.2如ItemId正在使用,记录到itemidbase数组中;否则标记ItemId未被使用
3.4计算数组中存储的元素个数
A.如个数为0,重置page
B.否则调用compactify_tuples压缩页
3.5为PageAddItem方法设置标记位
4.标记buffer为dirty
5.写入WAL Record
6.如all-visible,则设置页面all-visible标记
7.如page为all-visible,设置vm
8.返回下一个page的起始数组编号


/*
 *  lazy_vacuum_page() -- free dead tuples on a page
 *                   and repair its fragmentation.
 *
 * Caller must hold pin and buffer cleanup lock on the buffer.
 *
 * tupindex is the index in vacrelstats->dead_tuples of the first dead
 * tuple for this page.  We assume the rest follow sequentially.
 * The return value is the first tupindex after the tuples of this page.
 *
 * vmbuffer is passed to visibilitymap_get_status() and, dereferenced, to
 * visibilitymap_set() when the page turns out to be all-visible.
 */
static int
lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
                 int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer)
{
    /* the heap page held in the caller's buffer */
    Page        page = BufferGetPage(buffer);
    /* offset numbers of the line pointers marked unused below */
    OffsetNumber unused[MaxOffsetNumber];
    int         uncnt = 0;
    /* output of heap_page_is_all_visible(), forwarded to visibilitymap_set() */
    TransactionId visibility_cutoff_xid;
    /* output of heap_page_is_all_visible(): may the all-frozen bit be set too? */
    bool        all_frozen;
    pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
    /* The page modification and its WAL record form one critical section */
    START_CRIT_SECTION();
    /* Consume every consecutive dead-tuple entry that belongs to this block */
    for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
    {
        BlockNumber tblk;
        OffsetNumber toff;
        ItemId      itemid;
        tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
        if (tblk != blkno)
            break;              /* past end of tuples for this block */
        toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
        itemid = PageGetItemId(page, toff);
        /* Mark the line pointer unused and remember it for the WAL record */
        ItemIdSetUnused(itemid);
        unused[uncnt++] = toff;
    }
    /* Squeeze out the space the removed tuples occupied */
    PageRepairFragmentation(page);
    /*
     * Mark buffer dirty before we write WAL.
     */
    MarkBufferDirty(buffer);
    /* XLOG stuff */
    if (RelationNeedsWAL(onerel))
    {
        XLogRecPtr  recptr;
        /* Only the now-unused offsets are logged; the other two lists are empty */
        recptr = log_heap_clean(onerel, buffer,
                                NULL, 0, NULL, 0,
                                unused, uncnt,
                                vacrelstats->latestRemovedXid);
        PageSetLSN(page, recptr);
    }
    /*
     * End critical section, so we safely can do visibility tests (which
     * possibly need to perform IO and allocate memory!). If we crash now the
     * page (including the corresponding vm bit) might not be marked all
     * visible, but that's fine. A later vacuum will fix that.
     */
    END_CRIT_SECTION();
    /*
     * Now that we have removed the dead tuples from the page, once again
     * check if the page has become all-visible.  The page is already marked
     * dirty, exclusively locked, and, if needed, a full page image has been
     * emitted in the log_heap_clean() above.
     */
    if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid,
                                 &all_frozen))
        PageSetAllVisible(page);
    /*
     * All the changes to the heap page have been done. If the all-visible
     * flag is now set, also set the VM all-visible bit (and, if possible, the
     * all-frozen bit) unless this has already been done previously.
     */
    if (PageIsAllVisible(page))
    {
        uint8       vm_status = visibilitymap_get_status(onerel, blkno, vmbuffer);
        uint8       flags = 0;
        /* Set the VM all-frozen bit to flag, if needed */
        /* Request only the bits that are not already set in the map */
        if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
            flags |= VISIBILITYMAP_ALL_VISIBLE;
        if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
            flags |= VISIBILITYMAP_ALL_FROZEN;
        Assert(BufferIsValid(*vmbuffer));
        if (flags != 0)
            visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr,
                              *vmbuffer, visibility_cutoff_xid, flags);
    }
    /* First dead-tuple index that does not belong to this block */
    return tupindex;
}
/*
 * PageRepairFragmentation
 *
 * Frees fragmented space on a page.
 *
 * It doesn't remove unused line pointers! Please don't change this.
 *
 * This routine is usable for heap pages only, but see PageIndexMultiDelete.
 *
 * As a side effect, the page's PD_HAS_FREE_LINES hint bit is updated.
 *
 * Reports ERROR with ERRCODE_DATA_CORRUPTED if the page header pointers,
 * an item offset, or the total item length fail the sanity checks below.
 */
void
PageRepairFragmentation(Page page)
{
    Offset      pd_lower = ((PageHeader) page)->pd_lower;
    Offset      pd_upper = ((PageHeader) page)->pd_upper;
    Offset      pd_special = ((PageHeader) page)->pd_special;
    /* one entry per line pointer that has storage */
    itemIdSortData itemidbase[MaxHeapTuplesPerPage];
    itemIdSort  itemidptr;
    ItemId      lp;
    int         nline,
                nstorage,
                nunused;
    int         i;
    Size        totallen;
    /*
     * It's worth the trouble to be more paranoid here than in most places,
     * because we are about to reshuffle data in (what is usually) a shared
     * disk buffer.  If we aren't careful then corrupted pointers, lengths,
     * etc could cause us to clobber adjacent disk buffers, spreading the data
     * loss further.  So, check everything.
     */
    if (pd_lower < SizeOfPageHeaderData ||
        pd_lower > pd_upper ||
        pd_upper > pd_special ||
        pd_special > BLCKSZ ||
        pd_special != MAXALIGN(pd_special))
        ereport(ERROR,
                (errcode(ERRCODE_DATA_CORRUPTED),
                 errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
                        pd_lower, pd_upper, pd_special)));
    /*
     * Run through the line pointer array and collect data about live items.
     */
    /* highest offset number in use on the page */
    nline = PageGetMaxOffsetNumber(page);
    /* next free slot in itemidbase */
    itemidptr = itemidbase;
    nunused = totallen = 0;
    for (i = FirstOffsetNumber; i <= nline; i++)
    {
        lp = PageGetItemId(page, i);
        if (ItemIdIsUsed(lp))
        {
            /*
             * Used line pointers without storage are left alone: they are
             * neither recorded for compaction nor counted as unused.
             */
            if (ItemIdHasStorage(lp))
            {
                /* offset numbers are 1-based; the array index is 0-based */
                itemidptr->offsetindex = i - 1;
                itemidptr->itemoff = ItemIdGetOffset(lp);
                /* the item data must lie between pd_upper and pd_special */
                if (unlikely(itemidptr->itemoff < (int) pd_upper ||
                             itemidptr->itemoff >= (int) pd_special))
                    ereport(ERROR,
                            (errcode(ERRCODE_DATA_CORRUPTED),
                             errmsg("corrupted item pointer: %u",
                                    itemidptr->itemoff)));
                itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp));
                totallen += itemidptr->alignedlen;
                /* advance to the next array slot */
                itemidptr++;
            }
        }
        else
        {
            /* Unused entries should have lp_len = 0, but make sure */
            ItemIdSetUnused(lp);
            nunused++;
        }
    }
    /* number of entries collected in itemidbase */
    nstorage = itemidptr - itemidbase;
    if (nstorage == 0)
    {
        /* Page is completely empty, so just reset it quickly */
        ((PageHeader) page)->pd_upper = pd_special;
    }
    else
    {
        /* Need to compact the page the hard way */
        if (totallen > (Size) (pd_special - pd_lower))
            ereport(ERROR,
                    (errcode(ERRCODE_DATA_CORRUPTED),
                     errmsg("corrupted item lengths: total %u, available space %u",
                            (unsigned int) totallen, pd_special - pd_lower)));
        compactify_tuples(itemidbase, nstorage, page);
    }
    /* Set hint bit for PageAddItem */
    if (nunused > 0)
        PageSetHasFreeLinePointers(page);
    else
        PageClearHasFreeLinePointers(page);
}
/*
 * compactify_tuples
 *
 * After some line pointers have been removed or marked unused, slide the
 * remaining item data together so that no gaps are left between items.
 *
 * itemidbase holds nitems entries describing the items that have storage;
 * the array is sorted in place.  On return every affected line pointer's
 * lp_off and the page's pd_upper reflect the new layout.
 */
static void
compactify_tuples(itemIdSort itemidbase, int nitems, Page page)
{
    PageHeader  header = (PageHeader) page;
    Offset      dest;           /* where the next item's data will start */
    itemIdSort  cur;
    int         done;

    /* sort itemIdSortData array into decreasing itemoff order */
    qsort((char *) itemidbase, nitems, sizeof(itemIdSortData),
          itemoffcompare);

    /*
     * Pack the items downward from the special space.  The item with the
     * highest offset is placed first, so each item only ever moves toward
     * the end of the page; source and destination may overlap, hence
     * memmove.
     */
    dest = header->pd_special;
    for (done = 0, cur = itemidbase; done < nitems; done++, cur++)
    {
        /* offsetindex is zero-based, offset numbers are one-based */
        ItemId      linp = PageGetItemId(page, cur->offsetindex + 1);

        dest -= cur->alignedlen;
        memmove((char *) page + dest,
                (char *) page + cur->itemoff,
                cur->alignedlen);
        linp->lp_off = dest;
    }

    /* Free space now ends where the lowest item begins */
    header->pd_upper = dest;
}
/*
 * ItemIdSetUnused
 *      Set the item identifier to be UNUSED, with no storage.
 *      Beware of multiple evaluations of itemId!
 *
 *      Expands to a comma expression that assigns lp_flags, lp_off and
 *      lp_len in turn; itemId appears three times in the expansion, so
 *      pass an argument without side effects.
 */
#define ItemIdSetUnused(itemId) \
( \
    (itemId)->lp_flags = LP_UNUSED, \
    (itemId)->lp_off = 0, \
    (itemId)->lp_len = 0 \
)

三、跟踪分析

测试脚本 : 删除数据,执行vacuum


11:04:59 (xdb@[local]:5432)testdb=# delete from t1 where id < 600;
DELETE 100
14:26:16 (xdb@[local]:5432)testdb=# checkpoint;
CHECKPOINT
11:18:29 (xdb@[local]:5432)testdb=# vacuum verbose t1;

lazy_vacuum_heap
启动gdb,设置断点


(gdb) b lazy_vacuum_heap
Breakpoint 7 at 0x6bdf2e: file vacuumlazy.c, line 1472.
(gdb) c
Continuing.
Breakpoint 7, lazy_vacuum_heap (onerel=0x7f4c70d96688, vacrelstats=0x1873928) at vacuumlazy.c:1472
1472        Buffer      vmbuffer = InvalidBuffer;
(gdb)

输入参数
1-relation


(gdb) p *onerel
$14 = {rd_node = {spcNode = 1663, dbNode = 16402, relNode = 50820}, rd_smgr = 0x18362e0, rd_refcnt = 1, rd_backend = -1, 
  rd_islocaltemp = false, rd_isnailed = false, rd_isvalid = true, rd_indexvalid = 1 '\001', rd_statvalid = false, 
  rd_createSubid = 0, rd_newRelfilenodeSubid = 0, rd_rel = 0x7f4c70d95bb8, rd_att = 0x7f4c70d95cd0, rd_id = 50820, 
  rd_lockInfo = {lockRelId = {relId = 50820, dbId = 16402}}, rd_rules = 0x0, rd_rulescxt = 0x0, trigdesc = 0x0, 
  rd_rsdesc = 0x0, rd_fkeylist = 0x0, rd_fkeyvalid = false, rd_partkeycxt = 0x0, rd_partkey = 0x0, rd_pdcxt = 0x0, 
  rd_partdesc = 0x0, rd_partcheck = 0x0, rd_indexlist = 0x7f4c70d94820, rd_oidindex = 0, rd_pkindex = 0, 
  rd_replidindex = 0, rd_statlist = 0x0, rd_indexattr = 0x0, rd_projindexattr = 0x0, rd_keyattr = 0x0, rd_pkattr = 0x0, 
  rd_idattr = 0x0, rd_projidx = 0x0, rd_pubactions = 0x0, rd_options = 0x0, rd_index = 0x0, rd_indextuple = 0x0, 
  rd_amhandler = 0, rd_indexcxt = 0x0, rd_amroutine = 0x0, rd_opfamily = 0x0, rd_opcintype = 0x0, rd_support = 0x0, 
  rd_supportinfo = 0x0, rd_indoption = 0x0, rd_indexprs = 0x0, rd_indpred = 0x0, rd_exclops = 0x0, rd_exclprocs = 0x0, 
  rd_exclstrats = 0x0, rd_amcache = 0x0, rd_indcollation = 0x0, rd_fdwroutine = 0x0, rd_toastoid = 0, 
  pgstat_info = 0x182a030}

2-vacrelstats
存在索引,pages总数为124,扫描pages为124,原存活tuple为9501,新tuples为9401,已删除tuples为100,已删除的tuples的ItemPointer存储在dead_tuples数组中(大小为num_dead_tuples)


(gdb) p *vacrelstats
$15 = {hasindex = true, old_rel_pages = 124, rel_pages = 124, scanned_pages = 124, pinskipped_pages = 0, 
  frozenskipped_pages = 0, tupcount_pages = 124, old_live_tuples = 9501, new_rel_tuples = 9401, new_live_tuples = 9401, 
  new_dead_tuples = 0, pages_removed = 0, tuples_deleted = 100, nonempty_pages = 124, num_dead_tuples = 100, 
  max_dead_tuples = 36084, dead_tuples = 0x1884820, num_index_scans = 0, latestRemovedXid = 397073, 
  lock_waiter_detected = false}
(gdb)

1.初始化变量


(gdb) n
1474        pg_rusage_init(&ru0);
(gdb) 
1475        npages = 0;
(gdb) 
1477        tupindex = 0;
(gdb) p ru0
$16 = {tv = {tv_sec = 1548743482, tv_usec = 626506}, ru = {ru_utime = {tv_sec = 0, tv_usec = 40060}, ru_stime = {
      tv_sec = 0, tv_usec = 114769}, {ru_maxrss = 8900, __ru_maxrss_word = 8900}, {ru_ixrss = 0, __ru_ixrss_word = 0}, {
      ru_idrss = 0, __ru_idrss_word = 0}, {ru_isrss = 0, __ru_isrss_word = 0}, {ru_minflt = 5455, __ru_minflt_word = 5455}, 
    {ru_majflt = 0, __ru_majflt_word = 0}, {ru_nswap = 0, __ru_nswap_word = 0}, {ru_inblock = 2616, 
      __ru_inblock_word = 2616}, {ru_oublock = 376, __ru_oublock_word = 376}, {ru_msgsnd = 0, __ru_msgsnd_word = 0}, {
      ru_msgrcv = 0, __ru_msgrcv_word = 0}, {ru_nsignals = 0, __ru_nsignals_word = 0}, {ru_nvcsw = 814, 
      __ru_nvcsw_word = 814}, {ru_nivcsw = 2, __ru_nivcsw_word = 2}}}

2.遍历vacrelstats->dead_tuples数组(元素类型为ItemPointerData,共vacrelstats->num_dead_tuples个元素)


(gdb) n
1478        while (tupindex < vacrelstats->num_dead_tuples)
(gdb)

2.1获取块号/读取块到缓冲区中


1485            vacuum_delay_point();
(gdb) 
1487            tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
(gdb) 
1488            buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
(gdb) 
(gdb) p tblk
$17 = 29
(gdb) p buf
$18 = 175

2.2加锁,如不成功,则处理下一个元组


1490            if (!ConditionalLockBufferForCleanup(buf))
(gdb)

2.3调用lazy_vacuum_page释放空间,整理碎片


1496            tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats,
(gdb) p tupindex
$1 = 0
(gdb) n
1500            page = BufferGetPage(buf);
(gdb) p tupindex
$2 = 2
(gdb)

2.4获取page,获取该page的空闲空间


(gdb) n
1500            page = BufferGetPage(buf);
(gdb) p tupindex
$2 = 2
(gdb) n
1501            freespace = PageGetHeapFreeSpace(page);
(gdb)

2.5释放缓冲,记录空闲空间


(gdb) 
1503            UnlockReleaseBuffer(buf);
(gdb) 
1504            RecordPageWithFreeSpace(onerel, tblk, freespace);
(gdb) 
1505            npages++;
(gdb)

lazy_vacuum_page
进入lazy_vacuum_page函数


1496            tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats,
(gdb) p tblk
$3 = 30
(gdb) p buf
$4 = 178
(gdb) p tupindex
$5 = 2
(gdb) 
(gdb) step
lazy_vacuum_page (onerel=0x7f4c70d95570, blkno=30, buffer=178, tupindex=2, vacrelstats=0x18676a8, vmbuffer=0x7fffaef4a19c)
    at vacuumlazy.c:1535
1535        Page        page = BufferGetPage(buffer);
(gdb)

输入参数:块号/缓冲区编号/tuple数组下标以及vacrelstats(统计信息+辅助存储信息,如废弃元组数组等)


(gdb) p vacrelstats->dead_tuples[0]
$6 = {ip_blkid = {bi_hi = 0, bi_lo = 29}, ip_posid = 168}

1.初始化相关变量


(gdb) n
1537        int         uncnt = 0;
(gdb) 
1541        pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
(gdb) 
1543        START_CRIT_SECTION();
(gdb) 
1545        for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
(gdb) p page
$7 = (Page) 0x7f4c44f46380 "\001"
(gdb) p *page
$8 = 1 '\001'
(gdb) p *(PageHeader *)page
$9 = (PageHeader) 0x4ec2441800000001
(gdb) p *(PageHeader)page
$10 = {pd_lsn = {xlogid = 1, xrecoff = 1321354264}, pd_checksum = 0, pd_flags = 1, pd_lower = 1188, pd_upper = 7856, 
  pd_special = 8192, pd_pagesize_version = 8196, pd_prune_xid = 0, pd_linp = 0x7f4c44f46398}
(gdb)

2.遍历废弃元组数组
2.1获取块号,如块号不一致,跳出循环
2.2获取偏移/行指针
2.3标记为未使用,记录偏移


(gdb) n
1551            tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
(gdb) 
1552            if (tblk != blkno)
(gdb) p tblk
$11 = 30
(gdb) n
1554            toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
(gdb) p vacrelstats->dead_tuples[tupindex]
$12 = {ip_blkid = {bi_hi = 0, bi_lo = 30}, ip_posid = 162}
(gdb) n
1555            itemid = PageGetItemId(page, toff);
(gdb) p toff
$13 = 162
(gdb) n
1556            ItemIdSetUnused(itemid);
(gdb) p itemid
$14 = (ItemId) 0x7f4c44f4661c
(gdb) p *itemid
$15 = {lp_off = 0, lp_flags = 3, lp_len = 0}
(gdb) n
1557            unused[uncnt++] = toff;
(gdb) 
1545        for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
(gdb)

3.调用PageRepairFragmentation整理碎片
3.1判断和检查(严谨的编码!!!)


...
(gdb) b vacuumlazy.c:1560
Breakpoint 2 at 0x6be604: file vacuumlazy.c, line 1560.
(gdb) c
Continuing.
Breakpoint 2, lazy_vacuum_page (onerel=0x7f4c70d95570, blkno=30, buffer=178, tupindex=5, vacrelstats=0x18676a8, 
    vmbuffer=0x7fffaef4a19c) at vacuumlazy.c:1560
1560        PageRepairFragmentation(page);
(gdb) 
(gdb) step
PageRepairFragmentation (page=0x7f4c44f46380 "\001") at bufpage.c:481
481     Offset      pd_lower = ((PageHeader) page)->pd_lower;
(gdb) n
482     Offset      pd_upper = ((PageHeader) page)->pd_upper;
(gdb) 
483     Offset      pd_special = ((PageHeader) page)->pd_special;
(gdb) 
500     if (pd_lower < SizeOfPageHeaderData ||
(gdb) p pd_lower
$17 = 1188
(gdb) p pd_upper
$18 = 7856
(gdb) p pd_special
$19 = 8192
(gdb) n
501         pd_lower > pd_upper ||
(gdb) 
502         pd_upper > pd_special ||
(gdb) 
504         pd_special != MAXALIGN(pd_special))
(gdb) 
503         pd_special > BLCKSZ ||

3.2获取偏移,初始化变量


(gdb) 
513     nline = PageGetMaxOffsetNumber(page);
(gdb) n
514     itemidptr = itemidbase;
(gdb) 
515     nunused = totallen = 0;
(gdb) p nline
$20 = 291
(gdb) p *itemidptr
$21 = {offsetindex = 162, itemoff = 8144, alignedlen = 48}
(gdb)

3.3遍历行指针数组
3.3.1获取行指针lp
3.3.2如ItemId正在使用,记录到itemidbase数组中;否则标记ItemId未被使用


(gdb) 
516     for (i = FirstOffsetNumber; i <= nline; i++)
(gdb) n
519         if (ItemIdIsUsed(lp))
(gdb) 
539             ItemIdSetUnused(lp);
(gdb) 
540             nunused++;
(gdb) 
516     for (i = FirstOffsetNumber; i <= nline; i++)
(gdb)

跳出循环,继续执行


516     for (i = FirstOffsetNumber; i <= nline; i++)
(gdb) b bufpage.c:544
Breakpoint 3 at 0x8b1d2d: file bufpage.c, line 544.
(gdb) c
Continuing.
Breakpoint 3, PageRepairFragmentation (page=0x7f4c44f46380 "\001") at bufpage.c:544
544     nstorage = itemidptr - itemidbase;
(gdb) 
(gdb) p nunused
$22 = 284

3.4计算数组中存储的元素个数
A.如个数为0,重置page
B.否则调用compactify_tuples压缩页


(gdb) n
545     if (nstorage == 0)
(gdb) p nstorage
$23 = 7
(gdb) n
553         if (totallen > (Size) (pd_special - pd_lower))
(gdb) 
559         compactify_tuples(itemidbase, nstorage, page);
(gdb)

3.5为PageAddItem方法设置标记位


(gdb) 
563     if (nunused > 0)
(gdb) 
564         PageSetHasFreeLinePointers(page);
(gdb) 
567 }
(gdb)

4.标记buffer为dirty


(gdb) 
lazy_vacuum_page (onerel=0x7f4c70d95570, blkno=30, buffer=178, tupindex=5, vacrelstats=0x18676a8, vmbuffer=0x7fffaef4a19c)
    at vacuumlazy.c:1565
1565        MarkBufferDirty(buffer);
(gdb) n

5.写入WAL Record


1568        if (RelationNeedsWAL(onerel))
(gdb) 
1572            recptr = log_heap_clean(onerel, buffer,
(gdb) 
1576            PageSetLSN(page, recptr);
(gdb) 
1585        END_CRIT_SECTION();

6.如all-visible,则设置页面all-visible标记


(gdb) n
1593        if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid,
(gdb) 
1595            PageSetAllVisible(page);
(gdb)

7.如page为all-visible,设置vm


1602        if (PageIsAllVisible(page))
(gdb) 
1604            uint8       vm_status = visibilitymap_get_status(onerel, blkno, vmbuffer);
(gdb) 
1605            uint8       flags = 0;
(gdb) 
1608            if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
(gdb) 
1609                flags |= VISIBILITYMAP_ALL_VISIBLE;
(gdb) 
1610            if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
(gdb) 
1613            Assert(BufferIsValid(*vmbuffer));
(gdb) 
1614            if (flags != 0)
(gdb) 
1615                visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr,
(gdb)

8.返回下一个page的起始数组编号


(gdb) 
1619        return tupindex;
(gdb) p tupindex
$24 = 5
(gdb)

DONE!

四、参考资料

PG Source Code

来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/6906/viewspace-2565148/,如需转载,请注明出处,否则将追究法律责任。

请登录后发表评论 登录
全部评论
长期从事政务、金融等行业产品研发和架构设计工作,对Oracle、PostgreSQL以及大数据等相关技术有深入研究。现就职于广州云图数据技术有限公司,系统架构师。

注册时间:2007-12-28

  • 博文量
    1169
  • 访问量
    3634724