跳到主要内容

使用 JSON 类型字段

JSON 全称为 JavaScript Object Notation,是一种轻量级且易于使用的文本数据格式。本节将帮助您了解如何使用 JSON 类型的字段,包括插入 JSON 值,使用简单和高级操作符在 JSON 字段中进行标量过滤等。

概述

在 Collection 中,一个 JSON 字段由键值对组成,其中每个键是一个字符串,其相应的值可以是数字、字符串、布尔值、列表或数组。Zilliz Cloud 支持将字段以字典的形式插入到集群的 Collection 中。

以下示例代码展示如何随机生成一组数据记录,并在每条记录中包含一个名为 color 的 JSON 字段(以字典形式表示):

# 3. Prepare randomly generated entities to insert.
# Requires `import random` to be in scope.
colors = ["green", "blue", "yellow", "red", "black", "white", "purple", "pink", "orange", "brown", "grey"]
data = []

for i in range(1000):
    # Draw the values that make up the JSON field of this entity.
    current_color = random.choice(colors)
    current_tag = random.randint(1000, 9999)
    current_coord = [random.randint(0, 40) for _ in range(3)]
    current_ref = [[random.choice(colors) for _ in range(3)] for _ in range(3)]
    data.append({
        "id": i,
        "vector": [random.uniform(-1, 1) for _ in range(5)],
        # "color" is the JSON field: a dictionary holding a string, an
        # integer, a list of integers and a list of string lists.
        "color": {
            "label": current_color,
            "tag": current_tag,
            "coord": current_coord,
            "ref": current_ref
        }
    })

# Print the first record to show the structure of an entity.
print(data[0])

您可以查看随机生成的数据集中的第一条记录了解数据结构。

{
"id": 0,
"vector": [
-0.8017921296923975,
0.550046715206634,
0.764922589768134,
0.6371433836123146,
0.2705233937454232
],
"color": {
"label": "blue",
"tag": 9927,
"coord": [
22,
36,
6
],
"ref": [
[
"blue",
"green",
"white"
],
[
"black",
"green",
"pink"
],
[
"grey",
"black",
"brown"
]
]
}
}
📘说明

在一个 JSON 字段内的各键值对中,

  • 键名可包含字母、数字或下划线(_)。

  • 如果值为列表或数组,其元素数据类型需保持一致。

  • 如果值为 JSON,会被解析成字符串。

定义 JSON 字段

定义 JSON 字段的过程与定义其他类型字段的过程相同。

import random, time
from pymilvus import connections, MilvusClient, DataType

CLUSTER_ENDPOINT = "YOUR_CLUSTER_ENDPOINT"
TOKEN = "YOUR_CLUSTER_TOKEN"

# 1. Connect to the cluster.
client = MilvusClient(uri=CLUSTER_ENDPOINT, token=TOKEN)

# 2. Define the schema: a primary key, a 5-dimensional vector and a JSON field.
schema = MilvusClient.create_schema(auto_id=False, enable_dynamic_field=False)

schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=5)
# highlight-next-line
schema.add_field(field_name="color", datatype=DataType.JSON)

# Declare an index for the primary key and for the vector field.
index_params = MilvusClient.prepare_index_params()
index_params.add_index(field_name="id", index_type="STL_SORT")
index_params.add_index(
    field_name="vector",
    index_type="IVF_FLAT",
    metric_type="L2",
    params={"nlist": 1024},
)

# Create the collection from the schema and index settings,
# then print its load state.
client.create_collection(
    collection_name="test_collection",
    schema=schema,
    index_params=index_params,
)

res = client.get_load_state(collection_name="test_collection")

print(res)

# Output
#
# {
# "state": "<LoadState: Loaded>"
# }

插入字段值

Collection 创建完成后,就可以向 Collection 中插入概述中随机生成的字典了。

# Insert the generated entities into the collection and print the result.
res = client.insert(collection_name="test_collection", data=data)

print(res)

# Output
#
# {
# "insert_count": 1000,
# "ids": [
# 0,
# 1,
# 2,
# 3,
# 4,
# 5,
# 6,
# 7,
# 8,
# 9,
# "(990 more items hidden)"
# ]
# }

简单标量过滤

所有数据插入完成后,您可以使用 JSON 字段中的键进行搜索,搜索方法与基于标量字段搜索相同。

# 4. Basic search filtered by a key of the JSON field
# A single random 5-dimensional query vector.
query_vectors = [[random.uniform(-1, 1) for _ in range(5)]]

res = client.search(
    collection_name="test_collection",
    data=query_vectors,
    # highlight-next-line
    filter='color["label"] in ["red"]',
    search_params={"metric_type": "L2", "params": {"nprobe": 16}},
    output_fields=["id", "color"],
    limit=3,
)

print(res)

# Output
#
# [
# [
# {
# "id": 460,
# "distance": 0.4016231596469879,
# "entity": {
# "id": 460,
# "color": {
# "label": "red",
# "tag": 5030,
# "coord": [14, 32, 40],
# "ref": [
# [ "pink", "green", "brown" ],
# [ "red", "grey", "black"],
# [ "red", "yellow", "orange"]
# ]
# }
# }
# },
# {
# "id": 785,
# "distance": 0.451080858707428,
# "entity": {
# "id": 785,
# "color": {
# "label": "red",
# "tag": 5290,
# "coord": [31, 13, 23],
# "ref": [
# ["yellow", "pink", "pink"],
# ["purple", "grey", "orange"],
# ["grey", "purple", "pink"]
# ]
# }
# }
# },
# {
# "id": 355,
# "distance": 0.5839247703552246,
# "entity": {
# "id": 355,
# "color": {
# "label": "red",
# "tag": 8725,
# "coord": [5, 10, 22],
# "ref": [
# ["white", "purple", "yellow"],
# ["white", "purple", "white"],
# ["orange", "white", "pink"]
# ]
# }
# }
# }
# ]
# ]

高级标量过滤

针对 JSON 类型的字段,Zilliz Cloud 提供了一系列的高级过滤器,包括 JSON_CONTAINS、JSON_CONTAINS_ALL 和 JSON_CONTAINS_ANY。

  • 过滤出所有以 ["blue", "brown", "grey"] 为参考色彩集的 Entity。

    # 5. Advanced filtering within a JSON field
    # query() filters on scalar conditions only, so no query vectors are passed.
    res = client.query(
        collection_name="test_collection",
        filter='JSON_CONTAINS(color["ref"], ["blue", "brown", "grey"])',
        output_fields=["id", "color"],
        limit=3
    )

    print(res)

    # Output
    #
    # [
    # {
    # "id": 79,
    # "color": {
    # "label": "orange",
    # "tag": 8857,
    # "coord": [
    # 10,
    # 14,
    # 5
    # ],
    # "ref": [
    # [
    # "yellow",
    # "white",
    # "green"
    # ],
    # [
    # "blue",
    # "purple",
    # "purple"
    # ],
    # [
    # "blue",
    # "brown",
    # "grey"
    # ]
    # ]
    # }
    # },
    # {
    # "id": 371,
    # "color": {
    # "label": "black",
    # "tag": 1324,
    # "coord": [
    # 2,
    # 18,
    # 32
    # ],
    # "ref": [
    # [
    # "purple",
    # "orange",
    # "brown"
    # ],
    # [
    # "blue",
    # "brown",
    # "grey"
    # ],
    # [
    # "purple",
    # "blue",
    # "blue"
    # ]
    # ]
    # }
    # },
    # {
    # "id": 590,
    # "color": {
    # "label": "red",
    # "tag": 3340,
    # "coord": [
    # 13,
    # 21,
    # 13
    # ],
    # "ref": [
    # [
    # "yellow",
    # "yellow",
    # "red"
    # ],
    # [
    # "blue",
    # "brown",
    # "grey"
    # ],
    # [
    # "pink",
    # "yellow",
    # "purple"
    # ]
    # ]
    # }
    # }
    # ]
  • 过滤出所有色彩集标定包含 4 和 5 的 Entity。

    # query() filters on scalar conditions only, so no query vectors are passed.
    res = client.query(
        collection_name="test_collection",
        filter='JSON_CONTAINS_ALL(color["coord"], [4, 5])',
        output_fields=["id", "color"],
        limit=3
    )

    print(res)

    # Output
    #
    # [
    # {
    # "id": 281,
    # "color": {
    # "label": "red",
    # "tag": 3645,
    # "coord": [
    # 5,
    # 33,
    # 4
    # ],
    # "ref": [
    # [
    # "orange",
    # "blue",
    # "pink"
    # ],
    # [
    # "purple",
    # "blue",
    # "purple"
    # ],
    # [
    # "black",
    # "brown",
    # "yellow"
    # ]
    # ]
    # }
    # },
    # {
    # "id": 464,
    # "color": {
    # "label": "brown",
    # "tag": 6261,
    # "coord": [
    # 5,
    # 9,
    # 4
    # ],
    # "ref": [
    # [
    # "purple",
    # "purple",
    # "brown"
    # ],
    # [
    # "black",
    # "pink",
    # "white"
    # ],
    # [
    # "brown",
    # "grey",
    # "brown"
    # ]
    # ]
    # }
    # },
    # {
    # "id": 567,
    # "color": {
    # "label": "green",
    # "tag": 4589,
    # "coord": [
    # 5,
    # 39,
    # 4
    # ],
    # "ref": [
    # [
    # "purple",
    # "yellow",
    # "white"
    # ],
    # [
    # "yellow",
    # "yellow",
    # "brown"
    # ],
    # [
    # "blue",
    # "red",
    # "yellow"
    # ]
    # ]
    # }
    # }
    # ]
  • 过滤出所有色彩集标定包含 4 或 5 的 Entity。

    # query() filters on scalar conditions only, so no query vectors are passed.
    res = client.query(
        collection_name="test_collection",
        filter='JSON_CONTAINS_ANY(color["coord"], [4, 5])',
        output_fields=["id", "color"],
        limit=3
    )

    print(res)

    # Output
    #
    # [
    # {
    # "id": 0,
    # "color": {
    # "label": "yellow",
    # "tag": 6340,
    # "coord": [
    # 40,
    # 4,
    # 40
    # ],
    # "ref": [
    # [
    # "purple",
    # "yellow",
    # "orange"
    # ],
    # [
    # "green",
    # "grey",
    # "purple"
    # ],
    # [
    # "black",
    # "white",
    # "yellow"
    # ]
    # ]
    # }
    # },
    # {
    # "id": 2,
    # "color": {
    # "label": "brown",
    # "tag": 9359,
    # "coord": [
    # 38,
    # 21,
    # 5
    # ],
    # "ref": [
    # [
    # "red",
    # "brown",
    # "white"
    # ],
    # [
    # "purple",
    # "red",
    # "brown"
    # ],
    # [
    # "pink",
    # "grey",
    # "black"
    # ]
    # ]
    # }
    # },
    # {
    # "id": 7,
    # "color": {
    # "label": "green",
    # "tag": 3560,
    # "coord": [
    # 5,
    # 9,
    # 5
    # ],
    # "ref": [
    # [
    # "blue",
    # "orange",
    # "green"
    # ],
    # [
    # "blue",
    # "blue",
    # "black"
    # ],
    # [
    # "green",
    # "purple",
    # "green"
    # ]
    # ]
    # }
    # }
    # ]

使用 JSON 字段作为过滤条件

在使用 JSON 字段时,您可以使用 JSON 字段本身或其包含的任意键值对进行标量过滤。

📘说明
  • 在存储 JSON 字段时,Zilliz Cloud 不会对其内容进行转义。

例如,'a"b'、"a'b"、'a\'b' 和 "a\"b" 在存储时会保持原样,而 'a'b' 和 "a"b" 则会被认为是非法值。

  • 在使用 JSON 字段作为过滤条件时,可以使用该字段中的键名。

  • 如果某键名对应的值为整数或浮点数时,可以将其与另一个整数或浮点数进行比较,也可以将其与一个类型为 INT32/64 或 FLOAT32/64 的字段值进行比较。

  • 如果某键名对应的值为字符串时,可以将其与另一个字符串或类型为 VARCHAR 的字段进行比较。

简单操作符

假设 JSON 字段具有 A 键。使用 JSON 字段构建布尔表达式时,请参考以下表格。

操作符

示例

备注

<

"A < 3"

A 必须存在

>

"A > 1"

A 必须存在

==

"A == 1"、"A == 'abc'"

A 必须存在

!=

"A != 1"、"A != 'abc'"

A 可以不存在

<=

"A <= 5"

A 必须存在

>=

"A >= 1"

A 必须存在

not

"not A == 1"、"not A != 'abc'"

A 可以不存在

in

"A in [1, 2, 3]"、"A in ['a', 'b', 'c']"

A 必须存在

and (&&)

"A > 1 && A < 3"

A 是否必须存在取决于运算符两侧表达式的要求

or (||)

"A > 1 || A < 3"

A 是否必须存在取决于运算符两侧表达式的要求

exists

"exists A"

A 必须存在

高级操作符

针对 JSON 类型的字段,Zilliz Cloud 提供了一系列的高级过滤器,包括 JSON_CONTAINS、JSON_CONTAINS_ALL 和 JSON_CONTAINS_ANY。

  • json_contains(identifier, jsonExpr)

    该操作符可用于过滤包含指定表达式的 Entity。

    【例 1】{"x": [1,2,3]}

    json_contains(x, 1) # => True (x 包含 1。)
    json_contains(x, "a") # => False (x 没有包含值为 "a" 的元素。)

    【例 2】 {"x": [[1,2,3], [4,5,6], [7,8,9]]}

    json_contains(x, [1,2,3]) # => True (x 包含 [1,2,3]。)
    json_contains(x, [3,2,1]) # => False (x 没有包含值为 [3,2,1] 的元素。)
  • json_contains_all(identifier, jsonExpr)

    该操作符可用于过滤包含指定表达式中所有元素的 Entity。

    【例】 {"x": [1,2,3,4,5,7,8]}

    json_contains_all(x, [1,2,8]) # => True (x 包含 1、2 和 8。)
    json_contains_all(x, [4,5,6]) # => False (x 不包含 6。)
  • json_contains_any(identifier, jsonExpr)

    该操作符可用于过滤包含指定表达式中任意元素的 Entity。

    【例】 {"x": [1,2,3,4,5,7,8]}

    json_contains_any(x, [1,2,8]) # => True (x 包含 1、2 和 8。)
    json_contains_any(x, [4,5,6]) # => True (x 包含 4 和 5。)
    json_contains_any(x, [6,9]) # => False (x 不包含 6 或 9。)