query-14fd71611f6af350c372b85522871750

rq turtle/ttl

Properties: maintained by WikiProject (P6104), title (P1476), KIT Linked Open Numbers ID (P5176), numeric value (P1181)

Use at

PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX bd: <http://www.bigdata.com/rdf#>
PREFIX target: <http://www.wikidata.org/entity/Q56241615>
# Outer query: returns the scored n-grams produced by the aggregating subquery
# below, together with one example publication and its English title.
SELECT DISTINCT ?Ngram ?N ?Count ?Length ?Dashes ?Score ?ExamplePub ?ExamplePubTitle

# Named subquery consumed by `INCLUDE %items` in the title-preprocessing step.
# Without this declaration the INCLUDE refers to an undefined named subquery.
# NOTE(review): the selection criterion is reconstructed from the otherwise
# unused `target:` prefix and the "maintained by WikiProject (P6104)" property
# listed for this query -- confirm that this is the intended corpus.
WITH {
  SELECT ?Publication WHERE {
    ?Publication wdt:P6104 target: .
  }
} AS %items
WHERE {

{
  # Aggregation step: the regexes are applied to the prepared titles to cut out
  # n-grams, and each distinct n-gram is counted across the titles it occurs in.
  SELECT DISTINCT
    ?Ngram ?N
    (COUNT(DISTINCT ?Title) AS ?Count)
    ?Length ?Dashes
    # Score = title count x n-gram length x (dashes + 1) / word count
    ((?Count * ?Length * ((?Dashes + 1) / ?N)) AS ?Score)
    # Any one publication whose title yielded this n-gram
    (SAMPLE(DISTINCT ?Publication) AS ?ExamplePub)
  WHERE {

{
  # Regex generator, based on https://w.wiki/KG$ by Jura1.
  # For every number 1..150 that has a KIT Linked Open Numbers ID (P5176) and a
  # numeric value (P1181), build four patterns of the form
  #   ^([^ ]+ ){k}([^ ]+) .*
  # in which $1 is the k-th space-delimited token and $2 the token after it.
  # k takes the values NumericValue-1, +1, +3 and +5.
  SELECT ?Regex1 ?Regex2 ?Regex3 ?Regex4 ?NumericValue
  WHERE {
    ?Number wdt:P5176 [] ;
            wdt:P1181 ?NumericValue .
    FILTER(?NumericValue > 0 && ?NumericValue < 151)
    BIND(CONCAT("^([^ ]+ ){", STR(?NumericValue - 1), "}([^ ]+) .*") AS ?Regex1)
    BIND(CONCAT("^([^ ]+ ){", STR(?NumericValue + 1), "}([^ ]+) .*") AS ?Regex2)
    BIND(CONCAT("^([^ ]+ ){", STR(?NumericValue + 3), "}([^ ]+) .*") AS ?Regex3)
    BIND(CONCAT("^([^ ]+ ){", STR(?NumericValue + 5), "}([^ ]+) .*") AS ?Regex4)
  }
}
{ # Preprocessing the titles
  # Produces, per publication title, the lower-cased and padded string ?Seeds that the
  # n-gram regexes are applied to, plus the length of the cleaned title.
  SELECT ?Title ?Publication ?Seeds ?ClearTitleLength
   { 
      # Pull in the solutions of the named subquery %items (declared outside this block).
      INCLUDE %items
      ?Publication wdt:P1476 ?Title.
      BIND (REPLACE(STR(?Title),"[\\.:,;\\[\\]\\?()$]","") AS ?ClearTitle) # remove some frequent special characters, including colons and semicolons
      BIND(STRLEN(?ClearTitle) AS ?ClearTitleLength) 
      # Keep English-tagged titles only.
      FILTER(LANG(?Title)="en") 
      # Keep titles containing "habitat" or "habitats" as a whole word (matched on the lower-cased title).
      FILTER REGEX(LCASE(?Title), "\\bhabitat(s?)\\b").
      # Basic processing of the titles
      # Eight ":::" tokens are prepended and eight ";;;" tokens appended as padding.
      BIND ("::: ::: ::: ::: ::: ::: ::: ::: " AS ?StartCodon)
      BIND (" ;;; ;;; ;;; ;;; ;;; ;;; ;;; ;;;" AS ?StopCodon)
      BIND (LCASE(CONCAT(?StartCodon , # add start codon of colons to assist with processing of n-grams at beginning of title
                            ?ClearTitle, 
                            ?StopCodon)) # add stop codon of semicolons to assist with processing of n-grams at end of title
                     AS ?Seeds )
   }
}        BIND( 
          # Concatenate capture groups $1 and $2 of each of the four regexes, separated by
          # single spaces: eight extracted pieces per (title, regex set) combination.
          # When a pattern does not match, REPLACE returns ?Seeds unchanged.
          (CONCAT(
            REPLACE(?Seeds, ?Regex1, "$1"), " ", 
            REPLACE(?Seeds, ?Regex1, "$2"), " ", 
            REPLACE(?Seeds, ?Regex2, "$1"), " ", 
            REPLACE(?Seeds, ?Regex2, "$2"), " ", 
            REPLACE(?Seeds, ?Regex3, "$1"), " ", 
            REPLACE(?Seeds, ?Regex3, "$2"), " ", 
            REPLACE(?Seeds, ?Regex4, "$1"), " ", 
            REPLACE(?Seeds, ?Regex4, "$2")
          )
        ) AS ?NgramCandidate) 

        # Clean the candidate, innermost call first: drop every ';' and ':' (the padding
        # tokens), trim leading whitespace, trim trailing whitespace, then collapse runs
        # of two or more spaces into one.
        BIND( 
          (REPLACE
           (REPLACE
            (REPLACE
             (REPLACE
              (STR(?NgramCandidate),"([;:])",""),
              "(^\\s+)",""),
             "(\\s+$)",""),
            "([ ]{2,})"," ")
          ) AS ?Ngram) 

        # ?Length is the n-gram's character count; n-grams of three characters or fewer are dropped.
        BIND(STRLEN(?Ngram) AS ?Length) 
        FILTER (?Length > 3 )  
        # Drop n-grams longer than the cleaned title.
        # NOTE(review): this looks like the guard that removes candidates built from
        # non-matching regexes (where REPLACE returned the whole of ?Seeds) -- confirm.
        FILTER (?Length <= ?ClearTitleLength )  

        # ?N = number of whitespace characters in the n-gram + 1, i.e. its word count.
        BIND(STRLEN(REPLACE(?Ngram, "\\S", "")) + 1 as ?N)
        # ?Dashes = number of "-" characters in the n-gram.
        BIND((STRLEN(?Ngram) - STRLEN(REPLACE(?Ngram, "-", "")))  as ?Dashes)
      }
  # Group by the non-aggregated projections only. ?Count, ?Score and ?ExamplePub
  # are computed per group in this subquery's SELECT clause, so they are results
  # of the grouping and must not be listed as grouping keys themselves.
  GROUP BY ?Ngram ?N ?Length ?Dashes
  # Disabled filter: enabling it would keep only n-grams found in more than one title.
#   HAVING(?Count > 1)
}
  # Exclude Ngrams starting or ending with any of a set of blacklisted words.
  # The blacklisted word must be a whole word: at the start it has to be followed
  # by a space or the end of the n-gram, at the end it has to be preceded by a
  # space or the start of the n-gram. This also removes n-grams that consist of
  # a single blacklisted word (e.g. "with"), which patterns requiring an
  # adjacent space would let through.
  BIND("(a|and|between|during|for|from|in|of|on|or|the|to|with)" AS ?blacklist)
  BIND(CONCAT("^", ?blacklist, "( |$)") AS ?RegexBlackStart)
  BIND(CONCAT("(^| )", ?blacklist, "$") AS ?RegexBlackEnd)
  FILTER(!REGEX(?Ngram, ?RegexBlackStart))
  FILTER(!REGEX(?Ngram, ?RegexBlackEnd))

  # Fetch the English title of the sampled example publication for display.
  ?ExamplePub wdt:P1476 ?ExamplePubTitle .
  FILTER(LANG(?ExamplePubTitle) = "en")
}
# Every projected variable is a grouping key; no aggregates are computed at this level.
GROUP BY
  ?Ngram ?N ?Count ?Length ?Dashes ?Score ?ExamplePub ?ExamplePubTitle
# Highest score first; ties broken by title count, then by n-gram length.
ORDER BY
  DESC(?Score)
  DESC(?Count)
  DESC(?Length)
# Return at most 200 rows.
LIMIT 200

Query found at