query-5b0574efc5e50a52239b0a7c95b34573

rq turtle/ttl

Checking whether strings from the titles of publications already exist as lexemes

The query has three parts:

I - get a list of publications on a given topic

II - extract strings from the titles

III - check whether these strings exist as Wikidata lexemes

SELECT DISTINCT ?word ?wordUrl ?form ?formLabel ?lexeme ?lexemeLabel ?lexical_category ?lexical_categoryLabel (GROUP_CONCAT(DISTINCT ?featureLabel; separator=" // ") AS ?features) ?sense ?senseLabel (IRI(CONCAT("https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file&width=100&wpvalue=", SUBSTR(STR(SAMPLE(?images)), 52))) AS ?sense_image) WHERE {

I - get a list of publications on a given topic

{ SELECT DISTINCT ?x ?title WHERE { ?x wdt:P921 wd:Q202864 ; # Zika virus wdt:P1476 ?title. FILTER(STRLEN(?title) >= 6) } LIMIT 10 }

II - extract strings from the titles

BIND(LCASE(?title) AS ?ltitle) BIND(REPLACE(?ltitle, "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w1) BIND(REPLACE(STRAFTER(?ltitle, ?w1), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w2) BIND(REPLACE(STRAFTER(?ltitle, ?w2), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w3) VALUES ?w_ { 1 2 3 } BIND(IF(?w_ = 1, ?w1, IF(?w_ = 2, ?w2, ?w3)) AS ?word) FILTER(REGEX(?word, "^\\w+$")) # since ?w may evaluate to an empty string, e.g. for one-word titles

FILTER (LANG(?word) = "en")

III - check whether these strings exist as Wikidata lexemes

This part is taken from https://tools.wmflabs.org/ordia/text-to-lexemes

OPTIONAL { ?form ontolex:representation ?word . OPTIONAL { ?form wikibase:grammaticalFeature ?feature . BIND(STR(?feature) AS ?default_featureLabel) OPTIONAL { ?feature rdfs:label ?featureLabel_ . FILTER (LANG(?featureLabel_) = "en") } BIND(COALESCE(?featureLabel_, ?default_featureLabel) AS ?featureLabel) } ?form ontolex:representation ?formLabel .

START OF PROBLEMATIC SECTION

?lexeme ontolex:lexicalForm ?form .

?lexeme wikibase:lexicalCategory ?lexical_category .

BIND(STR(?lexical_category) AS ?default_lexical_categoryLabel)

OPTIONAL {

?lexical_category rdfs:label ?lexical_categoryLabel_ .

FILTER (LANG(?lexical_categoryLabel_) = 'en')

}

BIND(COALESCE(?lexical_categoryLabel_, ?default_lexical_categoryLabel) AS

?lexical_categoryLabel)

?lexeme wikibase:lemma ?lexemeLabel .

OPTIONAL {

?lexeme ontolex:sense ?sense .

BIND(SUBSTR(STR(?sense), 32) AS ?senseLabel)

OPTIONAL {

?sense wdt:P18 ?images .

}

}

END OF PROBLEMATIC SECTION

} BIND(IF(BOUND(?form), "", CONCAT("search?language=en&q=", ?word)) AS ?wordUrl)

} GROUP BY ?word ?wordUrl ?form ?formLabel ?lexeme ?lexemeLabel ?lexical_category ?lexical_categoryLabel ?sense ?senseLabel ORDER BY ?word

Use at

PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
################
# Checking whether strings from the titles of publications already exist as lexemes
# The query has three parts:
#   I - get a list of publications on a given topic
#  II - extract strings from the titles
# III - check whether these strings exist as Wikidata lexemes
################

# Extract up to three words of six or more characters from each publication
# title and report, per word, any lexeme form whose representation equals it.
# Output: one row per group of the variables listed in GROUP BY; ?features is
# the " // "-joined set of distinct grammatical-feature labels of the form.
# The GROUP BY clause lists every non-aggregated projected variable, so each
# result row is already unique and no DISTINCT is needed on this SELECT.
SELECT
  ?word ?wordUrl
  ?form ?formLabel
  ?lexeme ?lexemeLabel
  ?lexical_category ?lexical_categoryLabel
  (GROUP_CONCAT(DISTINCT ?featureLabel; separator=" // ") AS ?features)
  ?sense ?senseLabel
  # Thumbnail redirect URL built from the tail of one sampled ?images value.
  # NOTE(review): offset 52 presumably skips a fixed-length IRI prefix in
  # front of the file name - confirm against the actual P18 value format.
  (IRI(CONCAT("https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file&width=100&wpvalue=", 
          SUBSTR(STR(SAMPLE(?images)), 52))) AS ?sense_image)
WHERE {

#   I - get a list of publications on a given topic

  {
    SELECT DISTINCT ?x ?title WHERE {
      ?x wdt:P921 wd:Q202864 ;  # Zika virus
         wdt:P1476 ?title .
      # Titles shorter than six characters cannot contain a 6+ character word.
      FILTER(STRLEN(?title) >= 6)
      # Restrict to English titles *before* LIMIT is applied. Filtering only
      # after the subquery would let non-English titles consume the 10-title
      # budget and then be discarded, leaving few or no words to check.
      FILTER(LANG(?title) = "en")
    }
    LIMIT 10
  }

#  II - extract strings from the titles

  BIND(LCASE(?title) AS ?ltitle)
  # ?w1: first run of six or more word characters in the lower-cased title.
  # When the pattern does not match, REPLACE returns its input unchanged.
  BIND(REPLACE(?ltitle, "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w1)
  # ?w2 / ?w3: same extraction applied to the text following the first
  # occurrence of the previously extracted word.
  BIND(REPLACE(STRAFTER(?ltitle, ?w1), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w2)
  BIND(REPLACE(STRAFTER(?ltitle, ?w2), "^.*?(\\b\\w{6,}\\b).*$", "$1") AS ?w3)
  # Fan each title out into three rows, one per extracted candidate.
  VALUES ?w_ { 1 2 3 }
  BIND(IF(?w_ = 1, ?w1, IF(?w_ = 2, ?w2, ?w3)) AS ?word)
  # Keep single-token candidates only: ?w1..?w3 may evaluate to an empty
  # string (e.g. for one-word titles) or to a whole unmatched multi-word string.
  FILTER(REGEX(?word, "^\\w+$"))

  # Guard retained from the original query: only words tagged "en" are looked up.
  FILTER (LANG(?word) = "en")

# III - check whether these strings exist as Wikidata lexemes
# This part is taken from https://tools.wmflabs.org/ordia/text-to-lexemes

  OPTIONAL {
    ?form ontolex:representation ?word . 
    OPTIONAL {
      ?form wikibase:grammaticalFeature ?feature .
      # Fall back to the feature IRI when it has no English label.
      BIND(STR(?feature) AS ?default_featureLabel)
      OPTIONAL {
        ?feature rdfs:label ?featureLabel_ .
        FILTER (LANG(?featureLabel_) = "en")
      }
      BIND(COALESCE(?featureLabel_, ?default_featureLabel) AS ?featureLabel)
    }
    ?form ontolex:representation ?formLabel .

# START OF PROBLEMATIC SECTION    
# Disabled in the source. While it stays commented out, ?lexeme, ?lexemeLabel,
# ?lexical_category, ?lexical_categoryLabel, ?sense, ?senseLabel and ?images
# are never bound, so those result columns (and ?sense_image) are empty.
#    ?lexeme ontolex:lexicalForm ?form .
#
#    ?lexeme wikibase:lexicalCategory ?lexical_category .
#    BIND(STR(?lexical_category) AS ?default_lexical_categoryLabel)
#    OPTIONAL {
#      ?lexical_category rdfs:label ?lexical_categoryLabel_ .
#      FILTER (LANG(?lexical_categoryLabel_) = 'en')
#    }
#    BIND(COALESCE(?lexical_categoryLabel_, ?default_lexical_categoryLabel) AS
#    ?lexical_categoryLabel)
#      
#     
#    ?lexeme wikibase:lemma ?lexemeLabel .
#
#    OPTIONAL {
#      ?lexeme ontolex:sense ?sense .
#      BIND(SUBSTR(STR(?sense), 32) AS ?senseLabel)
#      OPTIONAL {
#        ?sense wdt:P18 ?images .
#      }
#    }
# END OF PROBLEMATIC SECTION    

  }
  # Words with no matching form get a relative search link; matched words get "".
  BIND(IF(BOUND(?form), "", CONCAT("search?language=en&q=", ?word)) AS ?wordUrl)

}
GROUP BY
  ?word ?wordUrl ?form ?formLabel
  ?lexeme ?lexemeLabel ?lexical_category ?lexical_categoryLabel
  ?sense ?senseLabel
ORDER BY ?word

Query found at

graph TD classDef projected fill:lightgreen; classDef literal fill:orange; classDef iri fill:yellow; v13("?default_featureLabel") v12("?feature") v14("?featureLabel"):::projected v11("?featureLabel_") v17("?features") v9("?form"):::projected v10("?formLabel"):::projected v16("?images") v4("?ltitle") v18("?sense_image") v2("?title") v5("?w1") v6("?w2") v7("?w3") v8("?w_") v9("?word"):::projected v15("?wordUrl"):::projected v3("?x") c5(["wd:Q202864"]):::iri f0[["?word = 'en'"]] f0 --> v9 f1[["regex(?word,'^\w+$')"]] f1 --> v9 f2[["string-length(?title) >= '6^^xsd:integer'"]] f2 --> v2 v3 --"wdt:P921"--> c5 v3 --"wdt:P1476"--> v2 bind3[/"lower-case(?title)"/] v2 --o bind3 bind3 --as--o v4 bind4[/"replace(?ltitle,'^.*?(\b\w{6,}\b).*$','$1')"/] v4 --o bind4 bind4 --as--o v5 bind5[/"replace(substring-after(?ltitle,?w1),'^.*?(\b\w{6,}\b).*$','$1')"/] v4 --o bind5 v5 --o bind5 bind5 --as--o v6 bind6[/"replace(substring-after(?ltitle,?w2),'^.*?(\b\w{6,}\b).*$','$1')"/] v4 --o bind6 v6 --o bind6 bind6 --as--o v7 bind7[/VALUES ?w_/] bind7-->v8 bind70(["1^^xsd:integer"]) bind70 --> bind7 bind71(["2^^xsd:integer"]) bind71 --> bind7 bind72(["3^^xsd:integer"]) bind72 --> bind7 bind8[/"if(?w_ = '1^^xsd:integer',?w1,if(?w_ = '2^^xsd:integer',?w2,?w3))"/] v8 --o bind8 v5 --o bind8 v6 --o bind8 v7 --o bind8 bind8 --as--o v9 subgraph optional0["(optional)"] style optional0 fill:#bbf,stroke-dasharray: 5 5; v9 -."ontolex:representation".-> v9 v9 --"ontolex:representation"--> v10 subgraph optional1["(optional)"] style optional1 fill:#bbf,stroke-dasharray: 5 5; v9 -."wikibase:grammaticalFeature".-> v12 bind9[/"str(?feature)"/] v12 --o bind9 bind9 --as--o v13 subgraph optional2["(optional)"] style optional2 fill:#bbf,stroke-dasharray: 5 5; v12 -."rdfs:label".-> v11 end bind10[/"?featureLabel_?default_featureLabel"/] v11 --o bind10 v13 --o bind10 bind10 --as--o v14 end end bind11[/"if(bound(?form),'',concat('search?language=en&q=',?word))"/] v9 --o bind11 v9 --o bind11 bind11 --as--o v15 
bind14[/"?featureLabel"/] v14 --o bind14 bind14 --as--o v17 bind15[/"concat('https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file&width=100&wpvalue=',substring(str(),'52^^xsd:integer'))"/] null --o bind15 bind15 --as--o v18