Skip to content

Reference

GraphFormatConfig

Bases: BaseModel

Graph formatting configuration class

This class stores configuration of how to convert tabular data into RDF graph data. Some examples of sample configurations can be found in the examples/ directory.

Attributes:

Name Type Description
source_name str

A string that is used to describe the source (e.g. "wikipedia" for data from Wikipedia)

predicate_mapping Dict[str, PredicateMapping]

A dictionary that maps the column_name to a predicate in URI form

primary_key str

The primary key of the row (usually something like id) that will be the subject of every quad

subject_namespace Optional[Namespace]

A string prepended to the quad's subject as a namespace, instead of just using the value of the primary_key.

graph_namespace Optional[Namespace]

Similar to subject_namespace in that this will assign each fact to a named graph with the graph_namespace.

date_field Optional[str]

The column in your dataset that the fact's "date" will be pulled from. When specified, the named graph field in each fact will be built from the date.

Source code in quadipy/schemas/graph_format_config.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
class GraphFormatConfig(BaseModel):
    """Graph formatting configuration class

    This class stores configuration of how to convert tabular data into RDF graph data.
    Some examples of sample configurations can be found in the `examples/` directory.

    Attributes:
        source_name: A string that is used to describe the source (e.g. "wikipedia" for data from wikipedia)
        predicate_mapping: A dictionary that maps the column_name to a predicate in URI form
        primary_key: The primary key of the row (usually something like `id`) that will be the subject of every quad
        subject_namespace: A string prepended to the quad's subject as a namespace, instead of just using the value of the `primary_key`.
        graph_namespace: Similar to `subject_namespace` in that this will assign each fact to a named graph with the `graph_namespace`.
        date_field: The column in your dataset that the fact's "date" will be pulled from. When specified, the named graph field in each fact will be built from the date.
    """

    source_name: str
    predicate_mapping: Dict[str, PredicateMapping]
    primary_key: str
    subject_namespace: Optional[Namespace]
    graph_namespace: Optional[Namespace]
    date_field: Optional[str]

    class Config:
        """Pydantic config class"""

        arbitrary_types_allowed = True
        allow_mutation = False
        extra = Extra.allow

    @validator("subject_namespace")
    @classmethod
    def _validate_subject_namespace(cls, value: Optional[str]) -> Optional[Namespace]:
        """Normalises the configured subject namespace through `format_namespace`"""
        return format_namespace(value)

    @validator("graph_namespace")
    @classmethod
    def _validate_graph_namespace(cls, value: Optional[str]) -> Optional[Namespace]:
        """Normalises the configured graph namespace through `format_namespace`"""
        return format_namespace(value)

    def subject(self, record: Dict) -> URIRef:
        """Creates URI out of primary key of the record

        Args:
            record: A dictionary that contains the data. Must contain the `primary_key` specified in the class

        Returns:
            An rdflib.URIRef with the value of the primary_key value in the record

        Raises:
            AssertionError: The `record` provided lacked the `primary_key` field
        """
        if self.primary_key not in record:
            # Raised explicitly instead of via `assert` so the check is not
            # stripped when Python runs with -O.
            raise AssertionError(
                f"{self.primary_key} isn't defined in {record}! Each record must have a defined primary key"
            )
        primary_key = str(record[self.primary_key])
        if self.subject_namespace:
            return self.subject_namespace[primary_key]
        return URIRef(primary_key)

    def obj(self, record: Dict, col_name: str) -> Optional[Union[Literal, URIRef]]:
        """Builds the object term for one column of the record

        Args:
            record: A dictionary that contains the data
            col_name: The column to read; must be a key of `predicate_mapping` when the record has a value for it

        Returns:
            The value converted by the column's `obj_datatype` (and placed in its `obj_namespace`
            when one is configured), or None when the record has no value for the column
        """
        value = record.get(col_name)
        if value is not None:
            predicate_mapping = self.predicate_mapping[col_name]
            obj_value = predicate_mapping.obj_datatype.value(value)
            return self.add_namespace_to_obj(col_name, obj_value)
        return None

    def add_namespace_to_obj(
        self, col_name: str, obj_value: Union[Literal, URIRef]
    ) -> Union[Literal, URIRef]:
        """Places `obj_value` in the column's `obj_namespace` if one is configured

        Args:
            col_name: The column whose predicate mapping is consulted
            obj_value: The already-converted object term

        Returns:
            The namespaced term, or `obj_value` unchanged when the mapping has no `obj_namespace`
        """
        predicate_mapping = self.predicate_mapping[col_name]
        if predicate_mapping.obj_namespace:
            return predicate_mapping.obj_namespace[obj_value]
        return obj_value

    def named_graph(self, record: Dict) -> Optional[URIRef]:
        """Determines the named graph a record's facts belong to

        `date_field` takes precedence over a bare `graph_namespace`. Leading and trailing
        "/" characters are stripped from the resulting URI.

        Args:
            record: A dictionary that contains the data

        Returns:
            The named graph URI, or None when neither `date_field` nor `graph_namespace` is set
        """
        if self.date_field:
            graph = self.build_graph_from_date(record)
            return URIRef(graph.strip("/"))
        if self.graph_namespace:
            graph = URIRef(self.graph_namespace)
            return URIRef(graph.strip("/"))
        return None

    def _validate_record_has_date_field(self, record: Dict) -> None:
        """Raises AssertionError when the record lacks the configured `date_field`"""
        if self.date_field not in record:
            # Explicit raise keeps the check active under `python -O`.
            raise AssertionError(
                f"{self.date_field} must be defined in {record} in order to build record from date"
            )

    def validate_date_field_is_valid_format(self, record: Dict) -> str:
        """Reads the record's date field and formats it as `YYYY-MM-DD`

        Args:
            record: A dictionary that contains the data

        Returns:
            The date formatted with `%Y-%m-%d`

        Raises:
            ValueError: If the value in the `date_field` isn't a date object or an ISO formatted date string
            AssertionError: If the `record` provided doesn't have the `date_field` defined
        """
        self._validate_record_has_date_field(record)
        date_value = record[self.date_field]
        try:
            if isinstance(date_value, date):
                dt = date_value
            else:
                dt = datetime.fromisoformat(date_value)
            return dt.strftime("%Y-%m-%d")
        except (TypeError, ValueError) as exc:
            # `fromisoformat` raises TypeError for non-string input (e.g. an int);
            # report it the same way as a malformed date string.
            raise ValueError(f"{date_value} isn't a valid date!") from exc

    def build_graph_from_date(self, record: Dict) -> URIRef:
        """Builds named graph URI from a date field in the record

        If you have a field like `created_at` or `updated_at` and want to store that metadata in the named graph
        field, this method will build the named graph URI from the `date_field`

        Examples:
            record = {"created_at": "2022-01-01"} -> URIRef("2022-01-01")

            If the config has a `graph_namespace` defined this would change to
            self.graph_namespace = "graph://wikipedia.org/"

            record = {"created_at": "2022-01-01"} -> URIRef("graph://wikipedia.org/2022-01-01")

        Args:
            record: A dictionary that contains the data

        Returns:
            a rdflib.URIRef of the named graph built from the `date_field`

        Raises:
            ValueError: If the value in the `date_field` isn't a valid date
            AssertionError: If the `record` provided doesn't have the `date_field` defined
        """
        date_value = self.validate_date_field_is_valid_format(record)
        if self.graph_namespace:
            return self.graph_namespace[date_value]
        return URIRef(date_value)

    def map_predicate_mapping_to_quad(
        self, col_name: str, predicate: URIRef, record: Dict
    ) -> Optional[Quad]:
        """Builds the single Quad for one column of the record

        Args:
            col_name: The column to read from the record
            predicate: The predicate URI the column is mapped to
            record: A dictionary that contains the data

        Returns:
            The Quad for the column, or None when the record has no value (or an empty value) for it
        """
        obj = self.obj(record, col_name)
        # Skip only missing or empty values. A plain truthiness test would also
        # drop literals such as 0 or False, whose rdflib truthiness follows the
        # underlying Python value.
        if obj is None or str(obj) == "":
            return None
        subject = self.subject(record)
        graph = self.named_graph(record)
        return Quad.from_tuple((subject, predicate, obj, graph))

    def process_quad_list(
        self, value: str, predicate_uri: URIRef, record: Dict
    ) -> List[Quad]:
        """Expands a JSON encoded list into one Quad per item

        Args:
            value: The raw column value, expected to be a JSON list
            predicate_uri: The predicate URI every produced Quad uses
            record: A dictionary that contains the data (used for the subject and named graph)

        Returns:
            One Quad per decoded item with the item as a Literal object. When `value` is not
            valid JSON, a single Quad holding the raw `value` as a Literal is returned instead
        """
        quads = []
        subject = self.subject(record)
        graph = self.named_graph(record)
        try:
            val_list = json.loads(value)
            for item in val_list:
                quad = Quad.from_tuple((subject, predicate_uri, Literal(item), graph))
                quads.append(quad)
        except SyntaxError:
            # `json.loads` reports malformed input as JSONDecodeError; this
            # handler is kept unchanged for backward compatibility.
            logging.info(f"Can't load list with value: {value}")
        except json.decoder.JSONDecodeError:
            # Not a JSON list after all: fall back to the raw string as one literal.
            quad = Quad.from_tuple((subject, predicate_uri, Literal(value), graph))
            quads.append(quad)
        return quads

    def quadify(self, record: Dict) -> List[Quad]:
        """Takes a record and translates into a list of Quads

        This process is explained more in-depth in the README but this is the high level method that translates
        a record into a list of Quads that can be inserted in an RDF graph

        Args:
            record: A dictionary that contains the data to be quadified

        Returns:
            A list of Quads
        """
        quads = []
        for col_name, predicate in self.predicate_mapping.items():
            value = record.get(col_name)
            # String values starting with "[" are treated as JSON encoded lists.
            if isinstance(value, str) and value and value[0] == "[":
                quads.extend(
                    self.process_quad_list(value, predicate.predicate_uri, record)
                )
            else:
                quad = self.map_predicate_mapping_to_quad(
                    col_name, predicate.predicate_uri, record
                )
                if quad:
                    quads.append(quad)
        return quads

Config

Pydantic config class

Source code in quadipy/schemas/graph_format_config.py
36
37
38
39
40
41
class Config:
    """Pydantic model settings"""

    # Fields not declared on the model are accepted.
    extra = Extra.allow
    # Instances cannot be modified after creation.
    allow_mutation = False
    # Permit field types pydantic has no built-in validator for.
    arbitrary_types_allowed = True

build_graph_from_date(record)

Builds named graph URI from a date field in the record

If you have a field like created_at or updated_at and want to store that metadata in the named graph field, this method will build the named graph URI from the date_field

Examples:

record = {"created_at": "2022-01-01"} -> URIRef("2022-01-01")

If the config has a graph_namespace defined this would change to self.graph_namespace = "graph://wikipedia.org/"

record = {"created_at": "2022-01-01"} -> URIRef("graph://wikipedia.org/2022-01-01")

Parameters:

Name Type Description Default
record Dict

A dictionary that contains the data

required

Returns:

Type Description
URIRef

a rdflib.URIRef of the named graph built from the date_field

Raises:

Type Description
ValueError

If the value in the date_field isn't a valid date

AssertionError

If the record provided doesn't have the date_field defined

Source code in quadipy/schemas/graph_format_config.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
def build_graph_from_date(self, record: Dict) -> URIRef:
    """Derives the named graph URI for a record from its date field

    Useful when a column such as `created_at` or `updated_at` should be kept as metadata in the
    named graph position of each fact. The value stored under `date_field` is checked and
    formatted by `validate_date_field_is_valid_format` before it is turned into a URI.

    Examples:
        Without a `graph_namespace`:
            record = {"created_at": "2022-01-01"} -> URIRef("2022-01-01")

        With self.graph_namespace = "graph://wikipedia.org/":
            record = {"created_at": "2022-01-01"} -> URIRef("graph://wikipedia.org/2022-01-01")

    Args:
        record: A dictionary that contains the data

    Returns:
        An rdflib.URIRef of the named graph built from the `date_field`

    Raises:
        ValueError: If the value in the `date_field` isn't a valid date
        AssertionError: If the `record` provided doesn't have the `date_field` defined
    """
    formatted_date = self.validate_date_field_is_valid_format(record)
    namespace = self.graph_namespace
    return namespace[formatted_date] if namespace else URIRef(formatted_date)

quadify(record)

Takes a record and translates into a list of Quads

This process is explained more in-depth in the README but this is the high level method that translates a record into a list of Quads that can be inserted in an RDF graph

Parameters:

Name Type Description Default
record Dict

A dictionary that contains the data to be quadified

required

Returns:

Type Description
List[Quad]

A list of Quads

Source code in quadipy/schemas/graph_format_config.py
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
def quadify(self, record: Dict) -> List[Quad]:
    """Translates one record into the Quads described by the predicate mapping

    Every mapped column contributes either a single Quad or, when its value is a string that
    starts with "[", the Quads produced by `process_quad_list`. The README describes the
    process in more depth.

    Args:
        record: A dictionary that contains the data to be quadified

    Returns:
        A list of Quads
    """
    collected = []
    for column, mapping in self.predicate_mapping.items():
        raw = record.get(column)
        if isinstance(raw, str) and raw.startswith("["):
            # List-like string: expand into one Quad per item.
            collected += self.process_quad_list(raw, mapping.predicate_uri, record)
            continue
        single = self.map_predicate_mapping_to_quad(
            column, mapping.predicate_uri, record
        )
        if single:
            collected.append(single)
    return collected

subject(record)

Creates URI out of primary key of the record

Parameters:

Name Type Description Default
record Dict

A dictionary that contains the data. Must contain the primary_key specified in the class

required

Returns:

Type Description
URIRef

An rdflib.URIRef with the value of the primary_key value in the record

Raises:

Type Description
AssertionError

The record provided lacked the primary_key field

Source code in quadipy/schemas/graph_format_config.py
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def subject(self, record: Dict) -> URIRef:
    """Creates URI out of primary key of the record

    Args:
        record: A dictionary that contains the data. Must contain the `primary_key` specified in the class

    Returns:
        An rdflib.URIRef with the value of the primary_key value in the record

    Raises:
        AssertionError: The `record` provided lacked the `primary_key` field
    """
    if self.primary_key not in record:
        # Raised explicitly instead of via `assert` so the check is not
        # stripped when Python runs with -O.
        raise AssertionError(
            f"{self.primary_key} isn't defined in {record}! Each record must have a defined primary key"
        )
    primary_key = str(record[self.primary_key])
    if self.subject_namespace:
        # Indexing the namespace yields the key prefixed with the namespace.
        return self.subject_namespace[primary_key]
    return URIRef(primary_key)

PredicateMapping

Bases: BaseModel

Source code in quadipy/schemas/predicate_mapping.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
class PredicateMapping(BaseModel):
    """Class to define relationship between data and predicate

    Attributes:
        predicate_uri: The URI the data will be mapped to
        obj_datatype: Datatype that the object will be serialized to. Defaults to literal but can be one of (literal, date, uri)
        obj_namespace: If the object value should be mapped to a specific namespace
    """

    predicate_uri: URIRef
    obj_datatype: ObjectDataTypes = ObjectDataTypes.literal
    obj_namespace: Optional[Namespace]

    class Config:
        """Pydantic config class"""

        arbitrary_types_allowed = True
        allow_mutation = False

    @validator("predicate_uri")
    @classmethod
    def _predicate_mapping_serialized_as_uri(cls, predicate_uri: Any) -> URIRef:
        """Checks the predicate with `_is_valid_uri` and returns it as a URIRef"""
        if not _is_valid_uri(predicate_uri):
            # Explicit raise (instead of `assert`) so validation still runs under
            # `python -O`; pydantic reports AssertionError as a validation error.
            raise AssertionError(f"{predicate_uri} is not a valid uri")
        return URIRef(predicate_uri)

    @validator("obj_namespace")
    @classmethod
    def _validate_obj_namespace(cls, value: Optional[str]) -> Optional[Namespace]:
        """Normalises the configured object namespace through `format_namespace`"""
        return format_namespace(value)

obj_namespace: Optional[Namespace] class-attribute

Class to define relationship between data and predicate

predicate_uri: The URI the data will be mapped to. obj_datatype: Datatype that the object will be serialized to; defaults to literal but can be one of (literal, date, uri). obj_namespace: Set if the object value should be mapped to a specific namespace.

Config

Pydantic config class

Source code in quadipy/schemas/predicate_mapping.py
42
43
44
45
46
class Config:
    """Pydantic model settings"""

    # Instances cannot be modified after creation.
    allow_mutation = False
    # Permit field types pydantic has no built-in validator for.
    arbitrary_types_allowed = True

Quad

Bases: BaseModel

Base RDF Fact class.

Each RDF fact is modeled as subject predicate object with an optional 4th term to specify the named graph. This is sometimes referred to as SPOG.

Attributes:

Name Type Description
subject Union[BNode, URIRef]

Must be a blank node or uri

predicate URIRef

Must be a uri

obj Union[URIRef, Literal]

We use obj to avoid shadowing the built-in object name in Python. Must be a blank node, uri, or literal

graph Optional[URIRef]

An optional argument that can be used to specify the named graph the fact will belong to. Must be a URI

Source code in quadipy/schemas/quad.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
class Quad(BaseModel):
    """Base RDF Fact class.

    Each RDF fact is modeled as subject predicate object with an optional 4th term to specify the named graph.
    This is sometimes referred to as SPOG.

    Attributes:
        subject: Must be a blank node or uri
        predicate: Must be a uri
        obj: We use obj to avoid shadowing the built-in `object` name in python. Must be a uri or literal
        graph: An optional argument that can be used to specify the named graph the fact will belong to. Must be a uri
    """

    subject: Union[BNode, URIRef]
    predicate: URIRef
    obj: Union[URIRef, Literal]
    graph: Optional[URIRef]

    class Config:
        """Pydantic config class"""

        arbitrary_types_allowed = True

    @validator("subject")
    @classmethod
    def _validate_subject(cls, subject: Any) -> Union[BNode, URIRef]:
        return cls._validate_value_type(subject, (BNode, URIRef), "subject")  # type: ignore

    @validator("predicate")
    @classmethod
    def _validate_predicate(cls, predicate: Any) -> URIRef:
        return cls._validate_value_type(predicate, (URIRef,), "predicate")  # type: ignore

    @validator("obj")
    @classmethod
    def _validate_obj(cls, obj: Any) -> Union[URIRef, Literal]:
        return cls._validate_value_type(obj, (URIRef, Literal), "obj")  # type: ignore

    @validator("graph")
    @classmethod
    def _validate_graph(cls, graph: Any) -> Any:
        # A falsy graph (e.g. None) is normalised to None.
        if graph:
            # NOTE(review): BNode and Literal are accepted here although the field
            # is annotated Optional[URIRef] -- confirm which is intended.
            return cls._validate_value_type(graph, (BNode, URIRef, Literal), "graph")
        return None

    @classmethod
    def from_tuple(cls, tup: Tuple) -> Quad:
        """Builds a Quad from a (subject, predicate, obj) or (subject, predicate, obj, graph) tuple

        Args:
            tup: A tuple of 3 or 4 terms

        Returns:
            The Quad built from the terms

        Raises:
            ValueError: If the tuple doesn't have 3 or 4 items
        """
        if len(tup) == 4:
            subject, predicate, obj, graph = tup
            return cls(subject=subject, predicate=predicate, obj=obj, graph=graph)
        if len(tup) == 3:
            subject, predicate, obj = tup
            return cls(subject=subject, predicate=predicate, obj=obj)
        raise ValueError(f"tuple must be of size 3,4 to be quadified: {tup}")

    def to_tuple(self) -> Tuple:
        """Converts quad to a tuple. This method is useful for adding Quad to rdflib Graphs"""
        if self.graph:
            return (self.subject, self.predicate, self.obj, self.graph)
        return (self.subject, self.predicate, self.obj)

    @staticmethod
    def _validate_value_type(
        value: Any, types: Tuple[Type, ...], field_name: str = "subject"
    ) -> Any:
        """Validates the value are the correct data type

        This is used to validate that subject, predicate, objects, and named graph types are the
        correct data type

        Args:
            value: the value to validate
            types: the types the value is allowed to be an instance of
            field_name: name of the field being validated, used in the error message.
                Defaults to "subject" so existing two-argument calls keep their message

        Returns:
            the value that was validated if it is the correct datatype

        Raises:
            TypeError: If the value isn't the correct datatype
        """
        if isinstance(value, types):
            return value
        raise TypeError(f"{field_name} must be of type {types} and not {type(value)}")

Config

Pydantic config class

Source code in quadipy/schemas/quad.py
27
28
29
30
class Config:
    """Pydantic config class"""

    # Permit field types pydantic has no built-in validator for.
    arbitrary_types_allowed = True

to_tuple()

Converts quad to a tuple. This method is useful for adding Quad to rdflib Graphs

Source code in quadipy/schemas/quad.py
64
65
66
67
68
def to_tuple(self) -> Tuple:
    """Returns the quad's terms as a plain tuple, e.g. for adding it to an rdflib Graph

    The graph term is appended only when it is set (truthy); otherwise a 3-tuple of
    (subject, predicate, obj) is returned.
    """
    triple = (self.subject, self.predicate, self.obj)
    return (*triple, self.graph) if self.graph else triple