1-  // Licensed to the .NET Foundation under one or more agreements. 
1+ // Licensed to the .NET Foundation under one or more agreements. 
22// The .NET Foundation licenses this file to you under the MIT license. 
33
44using  System . Collections . Immutable ; 
@@ -18,6 +18,8 @@ partial class ExtractSearchIndex : IPostProcessor
1818    [ GeneratedRegex ( @"\s+" ) ] 
1919    private  static   partial  Regex  s_regexWhiteSpace ( ) ; 
2020
21+     private  static   readonly  Regex  s_regexCase  =  new ( @"[a-z0-9]+|[A-Z0-9]+[a-z0-9]*|[0-9]+" ,  RegexOptions . Compiled ) ; 
22+ 
2123    private  static   readonly  HashSet < string >  s_htmlInlineTags  =  new ( StringComparer . OrdinalIgnoreCase ) 
2224    { 
2325        "a" ,  "area" ,  "del" ,  "ins" ,  "link" ,  "map" ,  "meta" ,  "abbr" ,  "audio" ,  "b" ,  "bdo" ,  "button" ,  "canvas" ,  "cite" ,  "code" ,  "command" ,  "data" , 
@@ -29,12 +31,20 @@ partial class ExtractSearchIndex : IPostProcessor
2931    public  string  Name  =>  nameof ( ExtractSearchIndex ) ; 
3032    public  const  string  IndexFileName  =  "index.json" ; 
3133
34+     internal  bool  UseMetadata  {  get ;  set ;  }  =  false ; 
35+     internal  bool  UseMetadataTitle  {  get ;  set ;  }  =  true ; 
36+ 
/// <summary>
/// Makes sure search is enabled in the build metadata and reads the
/// search-index options that control how index items are extracted.
/// </summary>
/// <param name="metadata">The metadata dictionary handed to this post-processor.</param>
/// <returns>
/// The metadata, with <c>_enableSearch</c> added as <c>true</c> when the key was absent.
/// </returns>
public ImmutableDictionary<string, object> PrepareMetadata(ImmutableDictionary<string, object> metadata)
{
    if (!metadata.ContainsKey("_enableSearch"))
    {
        metadata = metadata.Add("_enableSearch", true);
    }

    // The values are untyped objects. A hard (bool) cast would throw on a
    // non-boolean or null value, so anything that is not a boxed bool falls
    // back to the option's default instead.
    // Defaults to false: only an explicit boolean true turns it on.
    UseMetadata = metadata.TryGetValue("_searchIndexUseMetadata", out var useMetadataObject)
        && useMetadataObject is true;

    // Defaults to true: only an explicit boolean false turns it off.
    UseMetadataTitle = !metadata.TryGetValue("_searchIndexUseMetadataTitle", out var useMetadataTitleObject)
        || useMetadataTitleObject is not false;

    Logger.LogInfo($"{Name}: {nameof(UseMetadata)} = {UseMetadata}, {nameof(UseMetadataTitle)} = {UseMetadataTitle}");
    return metadata;
}
4050
@@ -49,14 +59,15 @@ public Manifest Process(Manifest manifest, string outputFolder, CancellationToke
4959        var  htmlFiles  =  ( from  item  in  manifest . Files  ??  Enumerable . Empty < ManifestItem > ( ) 
5060                         from  output  in  item . Output 
5161                         where  item . Type  !=  "Toc"  &&  output . Key . Equals ( ".html" ,  StringComparison . OrdinalIgnoreCase ) 
52-                          select  output . Value . RelativePath ) . ToList ( ) ; 
62+                          select  ( output . Value . RelativePath ,  item . Metadata ) ) . ToList ( ) ; 
63+ 
5364        if  ( htmlFiles . Count  ==  0 ) 
5465        { 
5566            return  manifest ; 
5667        } 
5768
5869        Logger . LogInfo ( $ "Extracting index data from { htmlFiles . Count }  html files") ; 
59-         foreach  ( var  relativePath  in  htmlFiles ) 
70+         foreach  ( ( string  relativePath ,   Dictionary < string ,   object >   metadata )  in  htmlFiles ) 
6071        { 
6172            cancellationToken . ThrowIfCancellationRequested ( ) ; 
6273
@@ -76,7 +87,7 @@ from output in item.Output
7687                    Logger . LogWarning ( $ "Warning: Can't load content from { filePath } : { ex . Message } ") ; 
7788                    continue ; 
7889                } 
79-                 var  indexItem  =  ExtractItem ( html ,  relativePath ) ; 
90+                 var  indexItem  =  ExtractItem ( html ,  relativePath ,   metadata ) ; 
8091                if  ( indexItem  !=  null ) 
8192                { 
8293                    indexData [ relativePath ]  =  indexItem ; 
@@ -99,7 +110,7 @@ from output in item.Output
99110        return  manifest ; 
100111    } 
101112
102-     internal  SearchIndexItem  ExtractItem ( HtmlDocument  html ,  string  href ) 
113+     internal  SearchIndexItem  ExtractItem ( HtmlDocument  html ,  string  href ,   Dictionary < string ,   object >   metadata   =   null ) 
103114    { 
104115        var  contentBuilder  =  new  StringBuilder ( ) ; 
105116
@@ -117,10 +128,37 @@ internal SearchIndexItem ExtractItem(HtmlDocument html, string href)
117128            ExtractTextFromNode ( node ,  contentBuilder ) ; 
118129        } 
119130
120-         var  content  =  NormalizeContent ( contentBuilder . ToString ( ) ) ; 
121-         var  title  =  ExtractTitleFromHtml ( html ) ; 
131+         string  title ; 
132+         string  summary  =  null ; 
133+         string  keywords  =  null ; 
122134
123-         return  new  SearchIndexItem  {  Href  =  href ,  Title  =  title ,  Keywords  =  content  } ; 
135+         var  isMRef  =  metadata  !=  null  &&  metadata . TryGetValue ( "IsMRef" ,  out  var  isMRefMetadata )  &&  ( bool ) isMRefMetadata ; 
136+         if  ( UseMetadata  &&  isMRef ) 
137+         { 
138+             title  =  UseMetadataTitle 
139+                 ?  ( string ) metadata [ "Title" ]  ??  ExtractTitleFromHtml ( html ) 
140+                 :  ExtractTitleFromHtml ( html ) ; 
141+ 
142+             var  htmlSummary  =  ( string ) metadata [ "Summary" ] ; 
143+             if  ( ! string . IsNullOrEmpty ( htmlSummary ) ) 
144+             { 
145+                 var  htmlDocument  =  new  HtmlDocument ( ) ; 
146+                 htmlDocument . LoadHtml ( htmlSummary ) ; 
147+                 var  htmlRootNode  =  htmlDocument . DocumentNode . FirstChild ; 
148+                 var  summaryBuilder  =  new  StringBuilder ( ) ; 
149+                 ExtractTextFromNode ( htmlRootNode ,  summaryBuilder ) ; 
150+                 summary  =  NormalizeContent ( summaryBuilder . ToString ( ) ) ; 
151+             } 
152+ 
153+             keywords  =  string . Join ( ' ' ,  title . Split ( ' ' ) . Select ( word =>  string . Join ( ' ' ,  GetStemAggregations ( word . Split ( '.' ) [ ^ 1 ] ) ) ) ) ; 
154+         } 
155+         else 
156+         { 
157+             title  =  ExtractTitleFromHtml ( html ) ; 
158+             summary  =  NormalizeContent ( contentBuilder . ToString ( ) ) ; 
159+         } 
160+ 
161+         return  new  SearchIndexItem  {  Href  =  href ,  Title  =  title ,  Summary  =  summary ,  Keywords  =  keywords  } ; 
124162    } 
125163
126164    private  static   string  ExtractTitleFromHtml ( HtmlDocument  html ) 
@@ -140,6 +178,41 @@ private static string NormalizeContent(string str)
140178        return  s_regexWhiteSpace ( ) . Replace ( str ,  " " ) . Trim ( ) ; 
141179    } 
142180
/// <summary>
/// HTML-decodes <paramref name="str"/> and returns every fragment matched by
/// <c>s_regexCase</c>, in the order the matches occur.
/// </summary>
/// <param name="str">The text to split; may be null or empty.</param>
/// <returns>
/// The matched fragments, or an array holding a single empty string when
/// <paramref name="str"/> is null or empty.
/// </returns>
private static string[] GetStems(string str)
{
    if (!string.IsNullOrEmpty(str))
    {
        var decoded = WebUtility.HtmlDecode(str);
        var fragments = new List<string>();
        foreach (Match match in s_regexCase.Matches(decoded))
        {
            fragments.Add(match.Value);
        }

        return fragments.ToArray();
    }

    return [string.Empty];
}
190+ 
/// <summary>
/// Splits <paramref name="str"/> into stems via <c>GetStems</c> and returns the
/// concatenation of every non-empty, order-preserving selection of those stems.
/// </summary>
/// <param name="str">The text whose stems are combined.</param>
/// <returns>
/// The concatenated selections in depth-first order: each selection is emitted
/// before the longer selections that extend it.
/// </returns>
private static List<string> GetStemAggregations(string str)
{
    var stems = GetStems(str);
    var aggregations = new List<string>();
    Expand(stems, string.Empty, aggregations, 0);
    return aggregations;

    // Appends each remaining stem to the prefix, records the result, then
    // extends that result with the stems that follow it.
    static void Expand(string[] parts, string prefix, List<string> output, int start)
    {
        for (var position = start; position < parts.Length; position++)
        {
            var combined = prefix + parts[position];
            output.Add(combined);
            Expand(parts, combined, output, position + 1);
        }
    }
}
215+ 
143216    private  static   void  ExtractTextFromNode ( HtmlNode  node ,  StringBuilder  contentBuilder ) 
144217    { 
145218        if  ( node  ==  null ) 
0 commit comments