跳到主要内容

使用 Array 类型字段

本节将帮助您了解如何使用 Array 类型的字段,包括插入数组,使用简单和高级操作符在数组字段中进行标量过滤等。

概述

Zilliz Cloud 支持在 Collection 中使用 Array 类型的字段。如果需要在 Collection 中添加一个 Array 类型的字段,还需要设置该字段包含的元素类型和最大元素数量。这就意味着 Collection 中 Array 类型的字段中各元素的数据类型须保持一致。

如下代码随机生成一组数据,其中包含一个数组字段。

import random

# Palette from which each entity's "color" value is drawn.
colors = ["green", "blue", "yellow", "red", "black", "white", "purple", "pink", "orange", "brown", "grey"]
data = []

# Build 1,000 entities. Each one carries a 5-dimensional vector, two scalar
# fields, and an array field ("color_coord") holding 3 to 5 integers.
for i in range(1000):
    current_color = random.choice(colors)
    current_tag = random.randint(1000, 9999)
    current_coord = [random.randint(0, 40) for _ in range(random.randint(3, 5))]
    data.append({
        "id": i,
        "vector": [random.uniform(-1, 1) for _ in range(5)],
        "color": current_color,
        "color_tag": current_tag,
        "color_coord": current_coord,
    })

# Show the first entity to illustrate the structure of the generated data.
print(data[0])

您可以通过查看随机生成的数据中的第一条记录来了解随机生成的数据结构。

{
    'id': 0,
    'vector': [
        0.0338537420906162,
        0.6844108238358322,
        0.28410588909961754,
        0.09752595400212116,
        0.22671013058761114
    ],
    'color': 'orange',
    'color_tag': 5677,
    'color_coord': [3, 0, 18, 29]
}
📘说明
  • 数组字段中的元素数据类型(Data Type)须保持一致。

  • 数组字段中包含的元素数量不得超过定义该字段时指定的最大容量(Max Capacity)。

定义 Array 字段

定义数组字段的过程与定义其他类型字段的过程相同。

import random, time
from pymilvus import MilvusClient, DataType

# Connection settings: replace the placeholders with your own cluster values.
CLUSTER_ENDPOINT = "YOUR_CLUSTER_ENDPOINT"
TOKEN = "YOUR_CLUSTER_TOKEN"

# 1. Set up a Milvus client
client = MilvusClient(uri=CLUSTER_ENDPOINT, token=TOKEN)

# 2. Create a collection
# 2.1 Describe the schema: primary keys are supplied by the caller and the
#     dynamic field is disabled.
schema = MilvusClient.create_schema(auto_id=False, enable_dynamic_field=False)

schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=5)
schema.add_field(field_name="color", datatype=DataType.VARCHAR, max_length=512)
schema.add_field(field_name="color_tag", datatype=DataType.INT64)
# The array field: every element is an INT64 and the array holds at most
# five elements.
schema.add_field(
    field_name="color_coord",
    datatype=DataType.ARRAY,
    element_type=DataType.INT64,
    max_capacity=5,
)

# 2.2 Declare the indexes: one on the primary key, one on the vector field.
index_params = MilvusClient.prepare_index_params()
index_params.add_index(field_name="id", index_type="STL_SORT")
index_params.add_index(
    field_name="vector",
    index_type="IVF_FLAT",
    metric_type="L2",
    params={"nlist": 1024},
)

# 2.3 Create the collection from the schema and index parameters.
client.create_collection(
    collection_name="test_collection",
    schema=schema,
    index_params=index_params,
)

# 2.4 Check the load state of the new collection.
res = client.get_load_state(collection_name="test_collection")

print(res)

# Output
#
# {
# "state": "<LoadState: Loaded>"
# }

插入字段值

Collection 创建完毕后,就可以向 Collection 中插入概述中随机生成的数据了。

# 3. Insert the randomly generated entities into the collection
res = client.insert(collection_name="test_collection", data=data)

print(res)

# Output
#
# {
#     "insert_count": 1000,
#     "ids": [
#         0,
#         1,
#         2,
#         3,
#         4,
#         5,
#         6,
#         7,
#         8,
#         9,
#         "(990 more items hidden)"
#     ]
# }

# Pause for five seconds before searching -- presumably to give the inserted
# entities time to become searchable; confirm the wait your deployment needs.
time.sleep(5)

简单标量过滤

在插入所有数据后,就可以像使用其它类型的标量字段一样使用数组中的元素进行检索和查询了。

# 4. Basic search with the array field
# A single random 5-dimensional query vector.
query_vectors = [[random.uniform(-1, 1) for _ in range(5)]]

search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 16},
}

# Array elements can be addressed by index in the filter expression: here
# only entities whose first "color_coord" element is below 10 are kept.
res = client.search(
    collection_name="test_collection",
    data=query_vectors,
    filter="color_coord[0] < 10",
    search_params=search_params,
    output_fields=["id", "color", "color_tag", "color_coord"],
    limit=3,
)

print(res)

# Output
#
# [
# [
# {
# "id": 993,
# "distance": 0.1538649946451187,
# "entity": {
# "color_coord": [
# 5,
# 37,
# 39,
# 18
# ],
# "id": 993,
# "color": "black",
# "color_tag": 6785
# }
# },
# {
# "id": 452,
# "distance": 0.2353954315185547,
# "entity": {
# "color_coord": [
# 2,
# 27,
# 34,
# 32,
# 30
# ],
# "id": 452,
# "color": "brown",
# "color_tag": 2075
# }
# },
# {
# "id": 862,
# "distance": 0.27913951873779297,
# "entity": {
# "color_coord": [
# 0,
# 19,
# 0,
# 26
# ],
# "id": 862,
# "color": "brown",
# "color_tag": 1787
# }
# }
# ]
# ]

高级标量过滤

和 JSON 字段一样,Zilliz Cloud 也针对 Array 字段提供了一系列高级过滤器,包括 ARRAY_CONTAINS、ARRAY_CONTAINS_ALL、ARRAY_CONTAINS_ANY 和 ARRAY_LENGTH。

  • 过滤出所有色彩集标定包含 10 的 Entity。

    # 5. Advanced search within the array field
    # ARRAY_CONTAINS: keep entities whose "color_coord" array includes 10.
    contains_filter = "ARRAY_CONTAINS(color_coord, 10)"

    res = client.query(
        collection_name="test_collection",
        filter=contains_filter,
        output_fields=["id", "color", "color_tag", "color_coord"],
        limit=3,
    )

    print(res)

    # Output
    #
    # [
    # {
    # "id": 21,
    # "color": "white",
    # "color_tag": 4202,
    # "color_coord": [
    # 10,
    # 5,
    # 5
    # ]
    # },
    # {
    # "id": 31,
    # "color": "grey",
    # "color_tag": 7386,
    # "color_coord": [
    # 8,
    # 10,
    # 23,
    # 7,
    # 31
    # ]
    # },
    # {
    # "id": 45,
    # "color": "purple",
    # "color_tag": 6126,
    # "color_coord": [
    # 0,
    # 10,
    # 24
    # ]
    # }
    # ]
  • 过滤出所有色彩集标定同时包含 7 和 8 的 Entity。

    # ARRAY_CONTAINS_ALL: keep entities whose "color_coord" array includes
    # both 7 and 8.
    contains_all_filter = "ARRAY_CONTAINS_ALL(color_coord, [7, 8])"

    res = client.query(
        collection_name="test_collection",
        filter=contains_all_filter,
        output_fields=["id", "color", "color_tag", "color_coord"],
        limit=3,
    )

    print(res)

    # Output
    #
    # [
    # {
    # "color": "grey",
    # "color_tag": 7386,
    # "color_coord": [
    # 8,
    # 10,
    # 23,
    # 7,
    # 31
    # ],
    # "id": 31
    # },
    # {
    # "color": "purple",
    # "color_tag": 7823,
    # "color_coord": [
    # 38,
    # 8,
    # 36,
    # 38,
    # 7
    # ],
    # "id": 258
    # },
    # {
    # "color": "purple",
    # "color_tag": 6356,
    # "color_coord": [
    # 34,
    # 32,
    # 11,
    # 8,
    # 7
    # ],
    # "id": 348
    # }
    # ]
  • 过滤出所有色彩集标定包含 7、8、9 中任意一个的 Entity。

    # ARRAY_CONTAINS_ANY: keep entities whose "color_coord" array includes
    # at least one of 7, 8 or 9.
    contains_any_filter = "ARRAY_CONTAINS_ANY(color_coord, [7, 8, 9])"

    res = client.query(
        collection_name="test_collection",
        filter=contains_any_filter,
        output_fields=["id", "color", "color_tag", "color_coord"],
        limit=3,
    )

    print(res)

    # Output
    #
    # [
    # {
    # "id": 0,
    # "color": "green",
    # "color_tag": 9212,
    # "color_coord": [
    # 37,
    # 36,
    # 36,
    # 7,
    # 9
    # ]
    # },
    # {
    # "id": 5,
    # "color": "blue",
    # "color_tag": 9643,
    # "color_coord": [
    # 8,
    # 20,
    # 20,
    # 11
    # ]
    # },
    # {
    # "id": 12,
    # "color": "blue",
    # "color_tag": 3075,
    # "color_coord": [
    # 29,
    # 7,
    # 17
    # ]
    # }
    # ]
  • 过滤出所有色彩集标定包含 4 个元素的 Entity。

    # ARRAY_LENGTH: keep entities whose "color_coord" array has exactly
    # four elements.
    length_filter = "ARRAY_LENGTH(color_coord) == 4"

    res = client.query(
        collection_name="test_collection",
        filter=length_filter,
        output_fields=["id", "color", "color_tag", "color_coord"],
        limit=3,
    )

    print(res)

    # Output
    #
    # [
    # {
    # "id": 1,
    # "color": "pink",
    # "color_tag": 6708,
    # "color_coord": [
    # 15,
    # 36,
    # 38,
    # 2
    # ]
    # },
    # {
    # "id": 4,
    # "color": "green",
    # "color_tag": 5386,
    # "color_coord": [
    # 13,
    # 32,
    # 35,
    # 5
    # ]
    # },
    # {
    # "id": 5,
    # "color": "blue",
    # "color_tag": 9643,
    # "color_coord": [
    # 8,
    # 20,
    # 20,
    # 11
    # ]
    # }
    # ]