diff --git a/scripts/README.md b/scripts/README.md index 566f293d59..73ae12137e 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -31,6 +31,18 @@ properties. Read the docs in [logseq.tasks.db-graph.create-graph](src/logseq/tasks/db_graph/create_graph.cljs) for specifics on the EDN map. +Another example is the `create_graph_with_schema_org.cljs` script which creates a graph +with the https://schema.org/ ontology with as many of the classes and properties as possible: + +``` +$ yarn -s nbb-logseq src/logseq/tasks/db_graph/create_graph_with_schema_org.cljs schema +Skipping 67 superseded properties +Skipping 25 properties with unsupported data types +Renaming 44 classes due to page name conflicts +Generating 2268 pages with 900 classes and 1368 properties ... +Created graph schema! +``` + #### Update graph scripts For database graphs, it is possible to update graphs with the diff --git a/scripts/src/logseq/tasks/db_graph/create_graph.cljs b/scripts/src/logseq/tasks/db_graph/create_graph.cljs index c10060b669..ebaa53f19c 100644 --- a/scripts/src/logseq/tasks/db_graph/create_graph.cljs +++ b/scripts/src/logseq/tasks/db_graph/create_graph.cljs @@ -105,8 +105,10 @@ (throw (ex-info (str "No :db/id for property '" prop-name "'") {:property prop-name})))}) (keys properties))) -(def block-count (atom 100001)) -(def new-db-id #(swap! block-count inc)) +(def current-db-id (atom 0)) +(def new-db-id + "Provides the next temp :db/id to use in a create-graph transact!" + #(swap! current-db-id dec)) (defn- ->block-tx [m uuid-maps property-db-ids page-id last-block] (let [property-refs (when (seq (:properties m)) @@ -160,21 +162,25 @@ (assoc :blocks (mapv #(merge {:block/uuid (random-uuid)} %) blocks)))) pages-and-blocks) {:keys [property-uuids] :as uuid-maps} (create-uuid-maps pages-and-blocks' properties) + property-db-ids (->> property-uuids + (map #(vector (name (first %)) (new-db-id))) + (into {})) created-at (js/Date.now) new-properties-tx (mapv (fn [[prop-name uuid]] - {:db/id (new-db-id) - :block/uuid uuid - :block/schema (merge {:type :default} - (get-in properties [prop-name :block/schema])) - :block/original-name (name prop-name) - :block/name (string/lower-case (name prop-name)) - :block/type "property" - :block/created-at created-at - :block/updated-at created-at}) + (merge {:db/id (or (property-db-ids (name prop-name)) + (throw (ex-info "No :db/id for property" {:property prop-name}))) + :block/uuid uuid + :block/schema (merge {:type :default} + (get-in properties [prop-name :block/schema])) + :block/original-name (name prop-name) + :block/name (string/lower-case (name prop-name)) + :block/type "property" + :block/created-at created-at + :block/updated-at created-at} + (when-let [props (not-empty (get-in properties [prop-name :properties]))] + {:block/properties (->block-properties-tx props uuid-maps) + :block/refs (build-property-refs props property-db-ids)}))) property-uuids) - property-db-ids (->> new-properties-tx - (map (juxt :block/original-name :db/id)) - (into {})) pages-and-blocks-tx (vec (mapcat diff --git a/scripts/src/logseq/tasks/db_graph/create_graph_with_schema_org.cljs b/scripts/src/logseq/tasks/db_graph/create_graph_with_schema_org.cljs index 027e284e12..579e717431 100644 --- a/scripts/src/logseq/tasks/db_graph/create_graph_with_schema_org.cljs +++ b/scripts/src/logseq/tasks/db_graph/create_graph_with_schema_org.cljs @@ -1,6 +1,13 @@ (ns logseq.tasks.db-graph.create-graph-with-schema-org "Script that converts the jsonld version of schema.org into Logseq classes and - properties. Initially works with 900 classes and 1368 properties" + properties. Initially works with 900 classes and 1368 properties! The script + currently provides the following in a Logseq graph: + * All schema.org classes with their name, url, parent class (namespace) and properties + * Some classes are renamed due to naming conflicts + * All properties with their property type, url, description + * Some properties are skipped because they are superseded/deprecated or because they have a property + type logseq doesnt' support yet + * schema.org assumes no cardinality. For now, only :object properties are given a :cardinality :many" (:require [logseq.tasks.db-graph.create-graph :as create-graph] [clojure.string :as string] [datascript.core :as d] @@ -12,9 +19,6 @@ [clojure.walk :as w] [babashka.cli :as cli])) -(def current-db-id (atom 0)) -(def new-db-id #(swap! current-db-id dec)) - (defn- get-class-db-id [class-db-ids class-id] (or (class-db-ids class-id) ;; Map of owl:equivalentClass exceptions @@ -38,13 +42,15 @@ (cond-> {:block/original-name (string/replace-first (class-m "@id") "schema:" "") :block/type "class" :block/uuid (get-class-uuid class-uuids (class-m "@id")) - :db/id (get-class-db-id class-db-ids (class-m "@id"))} + :db/id (get-class-db-id class-db-ids (class-m "@id")) + :properties {:url (string/replace-first (class-m "@id") "schema:" "https://schema.org/")}} parent-class (assoc :block/namespace {:db/id (get-class-db-id class-db-ids parent-class)}) (seq properties) (assoc :block/schema {:properties (mapv property-uuids properties)})))) (def schema->logseq-data-types + "Schema datatypes, https://schema.org/DataType, mapped to their Logseq equivalents" {"schema:Integer" :number "schema:Float" :number "schema:Number" :number @@ -53,7 +59,11 @@ "schema:Boolean" :checkbox "schema:Date" :date}) -(defn- ->property-page [property-m prop-uuid class-map class-uuids] +(def unsupported-data-types + "Schema datatypes, https://schema.org/DataType, that don't have Logseq equivalents" + #{"schema:Time" "schema:DateTime"}) + +(defn- ->property-page [property-m prop-uuid class-map class-uuids {:keys [verbose]}] (let [range-includes (as-> (property-m "schema:rangeIncludes") range-includes* (map (fn [m] (m "@id")) (if (map? range-includes*) [range-includes*] range-includes*))) @@ -61,8 +71,13 @@ schema-type (some #(or (schema->logseq-data-types %) (when (class-map %) :object)) range-includes) + _ (when (and verbose (> (count range-includes) 1)) + (println "Picked first property type for" (pr-str (property-m "@id")))) _ (assert schema-type (str "No schema found for property " (property-m "@id"))) schema (cond-> {:type schema-type} + ;; This cardinality rule should be adjusted as we use schema.org more + (= schema-type :object) + (assoc :cardinality :many) (property-m "rdfs:comment") (assoc :description (property-m "rdfs:comment")) (= schema-type :object) @@ -70,11 +85,8 @@ (throw (ex-info (str "No uuids found for range(s): " range-includes) {})))))] {(keyword (string/replace-first (property-m "@id") "schema:" "")) {:block/uuid prop-uuid - :block/schema schema}})) - -(def unsupported-data-types - "Schema datatypes, https://schema.org/DataType, that don't have Logseq equivalents" - #{"schema:Time" "schema:DateTime"}) + :block/schema schema + :properties {:url (string/replace-first (property-m "@id") "schema:" "https://schema.org/")}}})) (defn- get-class-to-properties "Given a vec of class ids and a vec of properties map to process, return a map of @@ -110,7 +122,7 @@ case sensitive whereas Logseq's :block/name is case insensitive. This is dealt with by appending a '_Class' suffix to conflicting classes. If this strategy changes, be sure to update schema->logseq-data-types" - [property-ids class-ids] + [property-ids class-ids {:keys [verbose]}] (let [conflicts (->> (concat property-ids class-ids) (group-by (comp string/lower-case first)) @@ -126,26 +138,46 @@ ;; b/c schema.org doesn't use '_' in their names (map #(vector % (str % "_Class"))) (into {}))] + (if verbose + (println "Renaming the following classes because they have property names that conflict with Logseq's case insensitive :block/name:" + (keys renamed-classes) "\n") + (println "Renaming" (count renamed-classes) "classes due to page name conflicts")) + ;; Looks for all instances of a renamed class and updates them to the renamed class reference renamed-classes)) (defn- get-all-properties [schema-data {:keys [verbose]}] (let [all-properties** (filter #(= "rdf:Property" (% "@type")) schema-data) [superseded-properties all-properties*] ((juxt filter remove) #(% "schema:supersededBy") all-properties**) _ (if verbose - (println "Skipping the following superseded properties:" (mapv #(% "@id") superseded-properties)) + (println "Skipping the following superseded properties:" (mapv #(% "@id") superseded-properties) "\n") (println "Skipping" (count superseded-properties) "superseded properties")) [unsupported-properties all-properties] ((juxt filter remove) property-with-unsupported-type? all-properties*) _ (if verbose - (println "Skipping the following unsupported properties:" (mapv #(% "@id") unsupported-properties)) + (println "Skipping the following unsupported properties:" (mapv #(% "@id") unsupported-properties) "\n") (println "Skipping" (count unsupported-properties) "properties with unsupported data types"))] all-properties)) -(defn create-init-data [options] - (let [schema-data (-> (str (fs/readFileSync "resources/schemaorg-current-https.json")) - js/JSON.parse - (js->clj) - (get "@graph")) - ;; TODO: See if it's worth pulling in non-types like schema:MusicReleaseFormatType +(defn- generate-pages + [select-classes class-uuids class-to-properties property-uuids options] + (let [;; Build db-ids for all classes as they are needed for refs later, across class maps + class-db-ids (->> select-classes + (map #(vector (% "@id") (create-graph/new-db-id))) + (into {})) + pages (mapv #(hash-map :page + (->class-page % class-db-ids class-uuids class-to-properties property-uuids options)) + select-classes)] + pages)) + +(defn- generate-properties + [select-properties property-uuids class-map class-uuids options] + (apply merge + (mapv #(->property-page % (property-uuids (% "@id")) class-map class-uuids options) + select-properties))) + +(defn- get-all-classes-and-properties + "Get all classes and properties from raw json file" + [schema-data options] + (let [;; TODO: See if it's worth pulling in non-types like schema:MusicReleaseFormatType all-classes* (filter #(contains? (set (as-> (% "@type") type' (if (string? type') [type'] type'))) "rdfs:Class") @@ -153,12 +185,8 @@ all-properties* (get-all-properties schema-data options) renamed-classes (detect-id-conflicts-and-get-renamed-classes (map #(vector (% "@id") :property) all-properties*) - (map #(vector (% "@id") :class) all-classes*)) - _ (if (:verbose options) - (println "Renaming the following classes because they have property names that conflict with Logseq's case insensitive :block/name:" - (keys renamed-classes)) - (println "Renaming" (count renamed-classes) "classes due to page name conflicts")) - ;; Looks for all instances of a renamed class and updates them to the renamed class reference + (map #(vector (% "@id") :class) all-classes*) + options) rename-class-ids (fn [m] (w/postwalk (fn [x] (if-let [new-class (and (map? x) (renamed-classes (x "@id")))] @@ -167,39 +195,42 @@ ;; Updates keys like @id, @subClassOf all-classes (map rename-class-ids all-classes*) ;; Updates keys like @id, @rangeIncludes, @domainIncludes - all-properties (map rename-class-ids all-properties*) + all-properties (map rename-class-ids all-properties*)] + [all-classes all-properties])) + +(defn- create-init-data [options] + (let [schema-data (-> (str (fs/readFileSync "resources/schemaorg-current-https.json")) + js/JSON.parse + (js->clj) + (get "@graph")) + [all-classes all-properties] (get-all-classes-and-properties schema-data options) + ;; Generate data shared across pages and properties class-map (->> all-classes (map #(vector (% "@id") %)) (into {})) - property-map (->> all-properties - (map #(vector (% "@id") %)) - (into {})) select-class-ids (keys class-map) ;; Debug: Uncomment to generate a narrower graph of classes ;; select-class-ids ["schema:Person" "schema:CreativeWorkSeries" ;; "schema:Movie" "schema:CreativeWork" "schema:Thing"] - select-classes (map #(class-map %) select-class-ids) - ;; Build db-ids for all classes as they are needed for refs later, across class maps - class-db-ids (->> select-classes - (map #(vector (% "@id") (new-db-id))) - (into {})) - ;; Generate all uuids as they are needed for properties and pages + ;; Generate class uuids as they are needed for properties (:object) and pages class-uuids (->> all-classes (map #(vector (% "@id") (random-uuid))) (into {})) class-to-properties (get-class-to-properties select-class-ids all-properties) - select-properties (mapcat val class-to-properties) - ;; Build property uuids as they are needed for properties and pages + select-properties (set (mapcat val class-to-properties)) + ;; Generate property uuids as they are needed for properties and pages (:schema properties) property-uuids (->> select-properties (map #(vector % (random-uuid))) (into {})) - properties-config (apply merge - (mapv #(->property-page (property-map %) (property-uuids %) class-map class-uuids) - select-properties)) - pages (mapv #(hash-map :page (->class-page % class-db-ids class-uuids class-to-properties property-uuids options)) - select-classes)] + ;; Generate pages and properties + properties (generate-properties + (filter #(contains? select-properties (% "@id")) all-properties) + property-uuids class-map class-uuids options) + pages (generate-pages + (map #(class-map %) select-class-ids) + class-uuids class-to-properties property-uuids options)] {:pages-and-blocks pages - :properties properties-config})) + :properties properties})) (def spec "Options spec" @@ -225,6 +256,7 @@ (count (:pages-and-blocks init-data)) " classes and " (count (:properties init-data)) " properties ...")) (d/transact! conn blocks-tx) + (when (:verbose options) (println "Transacted" (count (d/datoms @conn :eavt)) "datoms")) (println "Created graph" (str db-name "!")))) (when (= nbb/*file* (:file (meta #'-main)))