@@ -1957,14 +1957,18 @@ typedef struct
19571957
19581958/*
19591959 * TS_execute callback for matching a tsquery operand to headline words
1960+ *
1961+ * Note: it's tempting to report words[] indexes as pos values to save
1962+ * searching in hlCover; but that would screw up phrase matching, which
1963+ * expects to measure distances in lexemes not tokens.
19601964 */
19611965static TSTernaryValue
19621966checkcondition_HL (void * opaque , QueryOperand * val , ExecPhraseData * data )
19631967{
19641968 hlCheck * checkval = (hlCheck * ) opaque ;
19651969 int i ;
19661970
1967- /* scan words array for marching items */
1971+ /* scan words array for matching items */
19681972 for (i = 0 ; i < checkval -> len ; i ++ )
19691973 {
19701974 if (checkval -> words [i ].item == val )
@@ -1993,35 +1997,15 @@ checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
19931997 return TS_NO ;
19941998}
19951999
1996- /*
1997- * hlFirstIndex: find first index >= pos containing any word used in query
1998- *
1999- * Returns -1 if no such index
2000- */
2001- static int
2002- hlFirstIndex (HeadlineParsedText * prs , int pos )
2003- {
2004- int i ;
2005-
2006- for (i = pos ; i < prs -> curwords ; i ++ )
2007- {
2008- if (prs -> words [i ].item != NULL )
2009- return i ;
2010- }
2011- return -1 ;
2012- }
2013-
20142000/*
20152001 * hlCover: try to find a substring of prs' word list that satisfies query
20162002 *
2017- * At entry, *p must be the first word index to consider (initialize this
2018- * to zero, or to the next index after a previous successful search).
2019- * We will consider all substrings starting at or after that word, and
2020- * containing no more than max_cover words. (We need a length limit to
2021- * keep this from taking O(N^2) time for a long document with many query
2022- * words but few complete matches. Actually, since checkcondition_HL is
2023- * roughly O(N) in the length of the substring being checked, it's even
2024- * worse than that.)
2003+ * locations is the result of TS_execute_locations() for the query.
2004+ * We use this to identify plausible subranges of the query.
2005+ *
2006+ * *nextpos is the lexeme position (NOT word index) to start the search
2007+ * at. Caller should initialize this to zero. If successful, we'll
2008+ * advance it to the next place to search at.
20252009 *
20262010 * On success, sets *p to first word index and *q to last word index of the
20272011 * cover substring, and returns true.
@@ -2030,57 +2014,149 @@ hlFirstIndex(HeadlineParsedText *prs, int pos)
20302014 * words used in the query.
20312015 */
20322016static bool
2033- hlCover (HeadlineParsedText * prs , TSQuery query , int max_cover ,
2034- int * p , int * q )
2017+ hlCover (HeadlineParsedText * prs , TSQuery query , List * locations ,
2018+ int * nextpos , int * p , int * q )
20352019{
2036- int pmin ,
2037- pmax ,
2038- nextpmin ,
2039- nextpmax ;
2040- hlCheck ch ;
2020+ int pos = * nextpos ;
20412021
2042- /*
2043- * We look for the earliest, shortest substring of prs->words that
2044- * satisfies the query. Both the pmin and pmax indices must be words
2045- * appearing in the query; there's no point in trying endpoints in between
2046- * such points.
2047- */
2048- pmin = hlFirstIndex (prs , * p );
2049- while (pmin >= 0 )
2022+ /* This loop repeats when our selected word-range fails the query */
2023+ for (;;)
20502024 {
2051- /* This useless assignment just keeps stupider compilers quiet */
2052- nextpmin = -1 ;
2053- /* Consider substrings starting at pmin */
2054- ch .words = & (prs -> words [pmin ]);
2055- /* Consider the length-one substring first, then longer substrings */
2056- pmax = pmin ;
2057- do
2025+ int posb ,
2026+ pose ;
2027+ ListCell * lc ;
2028+
2029+ /*
2030+ * For each AND'ed query term or phrase, find its first occurrence at
2031+ * or after pos; set pose to the maximum of those positions.
2032+ *
2033+ * We need not consider ORs or NOTs here; see the comments for
2034+ * TS_execute_locations(). Rechecking the match with TS_execute(),
2035+ * below, will deal with any ensuing imprecision.
2036+ */
2037+ pose = -1 ;
2038+ foreach (lc , locations )
2039+ {
2040+ ExecPhraseData * pdata = (ExecPhraseData * ) lfirst (lc );
2041+ int first = -1 ;
2042+
2043+ for (int i = 0 ; i < pdata -> npos ; i ++ )
2044+ {
2045+ /* For phrase matches, use the ending lexeme */
2046+ int endp = pdata -> pos [i ];
2047+
2048+ if (endp >= pos )
2049+ {
2050+ first = endp ;
2051+ break ;
2052+ }
2053+ }
2054+ if (first < 0 )
2055+ return false; /* no more matches for this term */
2056+ if (first > pose )
2057+ pose = first ;
2058+ }
2059+
2060+ if (pose < 0 )
2061+ return false; /* we only get here if empty list */
2062+
2063+ /*
2064+ * Now, for each AND'ed query term or phrase, find its last occurrence
2065+ * at or before pose; set posb to the minimum of those positions.
2066+ *
2067+ * We start posb at INT_MAX - 1 to guarantee no overflow if we compute
2068+ * posb + 1 below.
2069+ */
2070+ posb = INT_MAX - 1 ;
2071+ foreach (lc , locations )
20582072 {
2059- /* Try to match query against pmin .. pmax substring */
2060- ch . len = pmax - pmin + 1 ;
2061- if ( TS_execute ( GETQUERY ( query ), & ch ,
2062- TS_EXEC_EMPTY , checkcondition_HL ) )
2073+ ExecPhraseData * pdata = ( ExecPhraseData * ) lfirst ( lc );
2074+ int last = - 1 ;
2075+
2076+ for ( int i = pdata -> npos - 1 ; i >= 0 ; i -- )
20632077 {
2064- * p = pmin ;
2065- * q = pmax ;
2066- return true;
2078+ /* For phrase matches, use the starting lexeme */
2079+ int startp = pdata -> pos [i ] - pdata -> width ;
2080+
2081+ if (startp <= pose )
2082+ {
2083+ last = startp ;
2084+ break ;
2085+ }
20672086 }
2068- /* Nope, so advance pmax to next feasible endpoint */
2069- nextpmax = hlFirstIndex (prs , pmax + 1 );
2087+ if (last < posb )
2088+ posb = last ;
2089+ }
2090+
2091+ /*
2092+ * We could end up with posb to the left of pos, in case some phrase
2093+ * match crosses pos. Try the match starting at pos anyway, since the
2094+ * result of TS_execute_locations is imprecise for phrase matches OR'd
2095+ * with plain matches; that is, if the query is "(A <-> B) | C" then C
2096+ * could match at pos even though the phrase match would have to
2097+ * extend to the left of pos.
2098+ */
2099+ posb = Max (posb , pos );
20702100
2101+ /* This test probably always succeeds, but be paranoid */
2102+ if (posb <= pose )
2103+ {
20712104 /*
2072- * If this is our first advance past pmin, then the result is also
2073- * the next feasible value of pmin; remember it to save a
2074- * redundant search.
2105+ * posb .. pose is now the shortest, earliest-after-pos range of
2106+ * lexeme positions containing all the query terms. It will
2107+ * contain all phrase matches, too, except in the corner case
2108+ * described just above.
2109+ *
2110+ * Now convert these lexeme positions to indexes in prs->words[].
20752111 */
2076- if (pmax == pmin )
2077- nextpmin = nextpmax ;
2078- pmax = nextpmax ;
2112+ int idxb = -1 ;
2113+ int idxe = -1 ;
2114+
2115+ for (int i = 0 ; i < prs -> curwords ; i ++ )
2116+ {
2117+ if (prs -> words [i ].item == NULL )
2118+ continue ;
2119+ if (idxb < 0 && prs -> words [i ].pos >= posb )
2120+ idxb = i ;
2121+ if (prs -> words [i ].pos <= pose )
2122+ idxe = i ;
2123+ else
2124+ break ;
2125+ }
2126+
2127+ /* This test probably always succeeds, but be paranoid */
2128+ if (idxb >= 0 && idxe >= idxb )
2129+ {
2130+ /*
2131+ * Finally, check that the selected range satisfies the query.
2132+ * This should succeed in all simple cases; but odd cases
2133+ * involving non-top-level NOT conditions or phrase matches
2134+ * OR'd with other things could fail, since the result of
2135+ * TS_execute_locations doesn't fully represent such things.
2136+ */
2137+ hlCheck ch ;
2138+
2139+ ch .words = & (prs -> words [idxb ]);
2140+ ch .len = idxe - idxb + 1 ;
2141+ if (TS_execute (GETQUERY (query ), & ch ,
2142+ TS_EXEC_EMPTY , checkcondition_HL ))
2143+ {
2144+ /* Match! Advance *nextpos and return the word range. */
2145+ * nextpos = posb + 1 ;
2146+ * p = idxb ;
2147+ * q = idxe ;
2148+ return true;
2149+ }
2150+ }
20792151 }
2080- while (pmax >= 0 && pmax - pmin < max_cover );
2081- /* No luck here, so try next feasible startpoint */
2082- pmin = nextpmin ;
2152+
2153+ /*
2154+ * Advance pos and try again. Any later workable match must start
2155+ * beyond posb.
2156+ */
2157+ pos = posb + 1 ;
20832158 }
2159+ /* Can't get here, but stupider compilers complain if we leave it off */
20842160 return false;
20852161}
20862162
@@ -2177,9 +2253,10 @@ get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
21772253 * it only controls presentation details.
21782254 */
21792255static void
2180- mark_hl_fragments (HeadlineParsedText * prs , TSQuery query , bool highlightall ,
2256+ mark_hl_fragments (HeadlineParsedText * prs , TSQuery query , List * locations ,
2257+ bool highlightall ,
21812258 int shortword , int min_words ,
2182- int max_words , int max_fragments , int max_cover )
2259+ int max_words , int max_fragments )
21832260{
21842261 int32 poslen ,
21852262 curlen ,
@@ -2192,6 +2269,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
21922269
21932270 int32 startpos = 0 ,
21942271 endpos = 0 ,
2272+ nextpos = 0 ,
21952273 p = 0 ,
21962274 q = 0 ;
21972275
@@ -2206,7 +2284,7 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
22062284 covers = palloc (maxcovers * sizeof (CoverPos ));
22072285
22082286 /* get all covers */
2209- while (hlCover (prs , query , max_cover , & p , & q ))
2287+ while (hlCover (prs , query , locations , & nextpos , & p , & q ))
22102288 {
22112289 startpos = p ;
22122290 endpos = q ;
@@ -2236,9 +2314,6 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
22362314 startpos = endpos + 1 ;
22372315 endpos = q ;
22382316 }
2239-
2240- /* move p to generate the next cover */
2241- p ++ ;
22422317 }
22432318
22442319 /* choose best covers */
@@ -2360,10 +2435,12 @@ mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
23602435 * Headline selector used when MaxFragments == 0
23612436 */
23622437static void
2363- mark_hl_words (HeadlineParsedText * prs , TSQuery query , bool highlightall ,
2364- int shortword , int min_words , int max_words , int max_cover )
2438+ mark_hl_words (HeadlineParsedText * prs , TSQuery query , List * locations ,
2439+ bool highlightall ,
2440+ int shortword , int min_words , int max_words )
23652441{
2366- int p = 0 ,
2442+ int nextpos = 0 ,
2443+ p = 0 ,
23672444 q = 0 ;
23682445 int bestb = -1 ,
23692446 beste = -1 ;
@@ -2379,7 +2456,7 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
23792456 if (!highlightall )
23802457 {
23812458 /* examine all covers, select a headline using the best one */
2382- while (hlCover (prs , query , max_cover , & p , & q ))
2459+ while (hlCover (prs , query , locations , & nextpos , & p , & q ))
23832460 {
23842461 /*
23852462 * Count words (curlen) and interesting words (poslen) within
@@ -2486,9 +2563,6 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
24862563 bestlen = poslen ;
24872564 bestcover = poscover ;
24882565 }
2489-
2490- /* move p to generate the next cover */
2491- p ++ ;
24922566 }
24932567
24942568 /*
@@ -2528,14 +2602,15 @@ prsd_headline(PG_FUNCTION_ARGS)
25282602 HeadlineParsedText * prs = (HeadlineParsedText * ) PG_GETARG_POINTER (0 );
25292603 List * prsoptions = (List * ) PG_GETARG_POINTER (1 );
25302604 TSQuery query = PG_GETARG_TSQUERY (2 );
2605+ hlCheck ch ;
2606+ List * locations ;
25312607
25322608 /* default option values: */
25332609 int min_words = 15 ;
25342610 int max_words = 35 ;
25352611 int shortword = 3 ;
25362612 int max_fragments = 0 ;
25372613 bool highlightall = false;
2538- int max_cover ;
25392614 ListCell * l ;
25402615
25412616 /* Extract configuration option values */
@@ -2575,15 +2650,6 @@ prsd_headline(PG_FUNCTION_ARGS)
25752650 defel -> defname )));
25762651 }
25772652
2578- /*
2579- * We might eventually make max_cover a user-settable parameter, but for
2580- * now, just compute a reasonable value based on max_words and
2581- * max_fragments.
2582- */
2583- max_cover = Max (max_words * 10 , 100 );
2584- if (max_fragments > 0 )
2585- max_cover *= max_fragments ;
2586-
25872653 /* in HighlightAll mode these parameters are ignored */
25882654 if (!highlightall )
25892655 {
@@ -2605,13 +2671,19 @@ prsd_headline(PG_FUNCTION_ARGS)
26052671 errmsg ("MaxFragments should be >= 0" )));
26062672 }
26072673
2674+ /* Locate words and phrases matching the query */
2675+ ch .words = prs -> words ;
2676+ ch .len = prs -> curwords ;
2677+ locations = TS_execute_locations (GETQUERY (query ), & ch , TS_EXEC_EMPTY ,
2678+ checkcondition_HL );
2679+
26082680 /* Apply appropriate headline selector */
26092681 if (max_fragments == 0 )
2610- mark_hl_words (prs , query , highlightall , shortword ,
2611- min_words , max_words , max_cover );
2682+ mark_hl_words (prs , query , locations , highlightall , shortword ,
2683+ min_words , max_words );
26122684 else
2613- mark_hl_fragments (prs , query , highlightall , shortword ,
2614- min_words , max_words , max_fragments , max_cover );
2685+ mark_hl_fragments (prs , query , locations , highlightall , shortword ,
2686+ min_words , max_words , max_fragments );
26152687
26162688 /* Fill in default values for string options */
26172689 if (!prs -> startsel )
0 commit comments