Elasticsearch高级搜索排序( 中文+拼音+首字母+简繁转换+特殊符号过滤)

/ 默认分类 / 没有评论 / 209浏览

一.先摆需求:

1、中文搜索、英文搜索、中英混搜   如:“成都”,“che队”

2、全拼搜索、首字母搜索、中文+全拼、中文+首字母混搜   如:“chengdu”,“cd”,“成du”,“成d”,“cheng都”,“c都”等等组合

3、简繁搜索、特殊符号过滤搜索   如:“龍馬”可通过“龙马”搜索,再比如“L.G.F”可以通过“lgf”搜索,“café”可以通过“cafe”搜索

4、排序优先级为: 以关键字开头>包含关键字


二.实现

1、索引设计

使用fields为搜索字段建立不同类型的索引,有全拼索引、首字母简写索引、Ngram索引以及IK索引,从各个角度分别击破,然后通过char-filter进行特殊符号过滤与简繁转换。

# Creates the dev_index_ps_assets index.
# The "nodename" field is indexed three ways through multi-fields:
#   nodename.words  - keyword tokenizer + edge n-grams (prefix matching)
#   nodename.pinyin - keyword tokenizer + pinyin filter (full pinyin and initials)
#   nodename.ik     - IK tokenizer (Chinese word segmentation)
# The "charconvert" mapping char_filter reads its rules from the char_filter_text file.
# Content-Type is sent explicitly because Elasticsearch 6+ rejects bodies without it.
# "index" is a boolean on text fields; the legacy "analyzed" value belongs to the old string type.
curl -XPUT 10.28.18.78:9200/dev_index_ps_assets -H 'Content-Type: application/json' -d '{
    "settings": {
        "refresh_interval": "5s",
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "analysis": {
            "filter": {
                "edge_ngram_filter": {
                    "type": "edge_ngram",
                    "min_gram": 1,
                    "max_gram": 50
                },
                "pinyin_filter": {
                    "type": "pinyin",
                    "keep_first_letter": true,
                    "keep_separate_first_letter": true,
                    "keep_full_pinyin": true,
                    "keep_joined_full_pinyin": true,
                    "keep_none_chinese": true,
                    "keep_none_chinese_together": true,
                    "keep_none_chinese_in_first_letter": true,
                    "keep_none_chinese_in_joined_full_pinyin": true,
                    "none_chinese_pinyin_tokenize": true,
                    "keep_original": false,
                    "limit_first_letter_length": 16,
                    "lowercase": true,
                    "trim_whitespace": true,
                    "first_letter": "prefix",
                    "padding_char": " "
                }
            },
            "char_filter": {
                "charconvert": {
                    "type": "mapping",
                    "mappings_path": "char_filter_text"
                }
            },
            "tokenizer": {
                "ik": {
                    "type": "ik_smart",
                    "use_smart": true
                }
            },
            "analyzer": {
                "ngramIndexAnalyzer": {
                    "type": "custom",
                    "tokenizer": "keyword",
                    "filter": ["edge_ngram_filter", "lowercase"],
                    "char_filter": ["charconvert"]
                },
                "ngramSearchAnalyzer": {
                    "type": "custom",
                    "tokenizer": "keyword",
                    "filter": ["lowercase"],
                    "char_filter": ["charconvert"]
                },
                "ikAnalyzer": {
                    "type": "custom",
                    "tokenizer": "ik",
                    "char_filter": ["charconvert"]
                },
                "pinyinAnalyzer": {
                    "tokenizer": "keyword",
                    "filter": ["pinyin_filter", "lowercase"]
                }
            }
        }
    },
    "mappings": {
        "type_ps_assets": {
            "properties": {
                "nodename": {
                    "type": "text",
                    "fields": {
                        "words": {
                            "type": "text",
                            "index": true,
                            "analyzer": "ngramIndexAnalyzer"
                        },
                        "pinyin": {
                            "type": "text",
                            "index": true,
                            "analyzer": "pinyinAnalyzer"
                        },
                        "ik": {
                            "type": "text",
                            "index": true,
                            "analyzer": "ikAnalyzer"
                        }
                    }
                }
            }
        }
    }
}'

2、搜索构建

以下是搜索实现代码(非完整代码,只摘录核心部分,主要是思路):

/**
     * Pure-Chinese keyword search.
     *
     * <p>Combines two match clauses in a dis_max query, wraps the result with
     * {@code builderQuery(cityId, ...)} and executes it:
     * <ol>
     *   <li>a match on {@code words} analyzed with {@code ngramSearchAnalyzer}, boosted by 5;
     *       the original author describes this as the "starts with the keyword" clause with
     *       the highest priority;</li>
     *   <li>a match on {@code words.IKS} analyzed with {@code ikSearchAnalyzer} in which
     *       100% of the analyzed terms must match.</li>
     * </ol>
     *
     * @param key    the search keyword
     * @param cityId forwarded unchanged to {@code builderQuery}
     * @return the hits returned by {@code elasticsearchTemplate.queryForList}, one map per hit
     * @throws Exception declared by the original signature
     */
    public List<Map> chineseSearch(String key,Integer cityId) throws Exception{
        // Clause 1: boosted match against the n-gram indexed field.
        QueryBuilder prefixClause = QueryBuilders.matchQuery("words", key)
                .analyzer("ngramSearchAnalyzer")
                .boost(5);

        // Clause 2: every analyzed term of the keyword has to be present.
        QueryBuilder containsClause = QueryBuilders.matchQuery("words.IKS", key)
                .analyzer("ikSearchAnalyzer")
                .minimumShouldMatch("100%");

        DisMaxQueryBuilder combined = QueryBuilders.disMaxQuery()
                .add(prefixClause)
                .add(containsClause);

        return elasticsearchTemplate.queryForList(builderQuery(cityId, combined), Map.class);
    }

/** * 混合搜索 * @return */ public List<Map> chineseWithEnglishOrPinyinSearch(String key,Integer cityId) throws Exception{

    DisMaxQueryBuilder  disMaxQueryBuilder=QueryBuilders.disMaxQuery();
    //是否有中文开头,有则返回中文前缀
    String startChineseString=commonSearchService.getStartChineseString(key);        
    /**
     * 源值搜索,不做拼音转换    
     * 权重* 1.5
     */        
    QueryBuilder normSearchBuilder=QueryBuilders.matchQuery("words",key).analyzer("ngramSearchAnalyzer").boost(5f);        
/**
 * 拼音简写搜索
 * 1、分析key,转换为简写  case:  南京东路==&amp;gt;njdl,南京dl==&amp;gt;njdl,njdl==&amp;gt;njdl
 * 2、搜索匹配,必须完整匹配简写词干
 * 3、如果有中文前缀,则排序优先
 * 权重*1
 */
String analysisKey=commonSearchService.anaysisKeyAndGetMaxWords(SearchIndex.INDEX_NAME_SEARCHWORDSSTATISTICS,key,"pinyiSimpleSearchAnalyzer");
QueryBuilder pingYinSampleQueryBuilder=QueryBuilders.termQuery("words.SPY", analysisKey);

/**
 * 拼音简写包含匹配,如 njdl可以查出 "城市公牛 南京东路店",虽然非南京东路开头
 * 权重*0.8
 */
QueryBuilder  pingYinSampleContainQueryBuilder=null;
if(analysisKey.length()&amp;gt;1){
    pingYinSampleContainQueryBuilder=QueryBuilders.wildcardQuery("words.SPY", "*"+analysisKey+"*").boost(0.8f);
}

/**
 * 拼音全拼搜索
 * 1、分析key,获取拼音词干   case :  南京东路==&amp;gt;[nan,jing,dong,lu],南京donglu==&amp;gt;[nan,jing,dong,lu]
 * 2、搜索查询,必须匹配所有拼音词,如南京东路,则nan,jing,dong,lu四个词干必须完全匹配
 * 3、如果有中文前缀,则排序优先  
 * 权重*1
 */
QueryBuilder pingYinFullQueryBuilder=null;
if(key.length()&amp;gt;1){
    pingYinFullQueryBuilder=QueryBuilders.matchPhraseQuery("words.FPY", key).analyzer("pinyiFullSearchAnalyzer");    
}

/**
 * 完整包含关键字查询(优先级最低,只有以上四种方式查询无结果时才考虑)
 * 权重*0.8
 */
QueryBuilder containSearchBuilder=QueryBuilders.matchQuery("words.IKS", key).analyzer("ikSearchAnalyzer").minimumShouldMatch("100%");
        
disMaxQueryBuilder
.add(normSearchBuilder)
.add(pingYinSampleQueryBuilder)    
.add(containSearchBuilder);

//以下两个对性能有一定的影响,故作此判定,单个字符不执行此类搜索
if(pingYinFullQueryBuilder!=null){
    disMaxQueryBuilder.add(pingYinFullQueryBuilder);
}
if(pingYinSampleContainQueryBuilder!=null){
    disMaxQueryBuilder.add(pingYinSampleContainQueryBuilder);
}        

QueryBuilder queryBuilder=disMaxQueryBuilder;

//关键如果有中文,则必须包含在内容中
if(StringUtils.isNotBlank(startChineseString)){
    queryBuilder=    QueryBuilders.filteredQuery(disMaxQueryBuilder,
            FilterBuilders.queryFilter(QueryBuilders.queryStringQuery("*"+startChineseString+"*").field("words").analyzer("ngramSearchAnalyzer")));
    queryBuilder=QueryBuilders.functionScoreQuery(queryBuilder)
    .add(FilterBuilders.queryFilter(QueryBuilders.matchQuery("words",startChineseString).analyzer("ngramSearchAnalyzer")), ScoreFunctionBuilders.weightFactorFunction(1.5f));
}                

SearchQuery searchQuery=builderQuery(cityId,queryBuilder);

return  elasticsearchTemplate.queryForList(searchQuery,Map.class);

}

3.使用jest注意事项,查询字段定义pinyin和ik(拼音和中文)需要前置确定查询字段后缀,如:xx.pinyin或xx.ik

public final class EsUtils {
private static final String QUERY_TYPE_PINYIN = ".pinyin";
private static final String QUERY_TYPE_IK = ".ik";

/**

  • 获取查询类型
  • @param str
  • @return */ public static String getQueryType(String str){ String result = QUERY_TYPE_PINYIN; if (ChineseCharToEn.isChinese(str)){ result = QUERY_TYPE_IK; } return result; }
调用示例

    public List<Map<String, Object>> queryEmployeesList(PersonnelReq req) throws BusinessException {

        String value = req.getValue();

        Asserts.notNull(value, "内容不能为空!");


        List<Criteria> criterias = new ArrayList<>();

        Criteria criteria = new Criteria("employeeName"+EsUtils.getQueryType(value),value,true);

        criterias.add(criteria);


        return esDao.highlightedSearch(EsDB.INDEX_PS_EMPLOYEES,EsDB.TYPE_PS_EMPLOYEES, criterias,Map.class,1,20);

    }