Skip to content

Commit 715c0c6

Browse files
committed
apacheGH-40933: [Java] Enhance the copyFrom* functionality in StringView (apache#41752)
### Rationale for this change

The initial implementation of StringView did not include `copy` functionality. This PR adds that feature.

### What changes are included in this PR?

This PR adds the `copyFrom` and `copyFromSafe` functions to `BaseVariableWidthViewVector`.

### Are these changes tested?

Yes

### Are there any user-facing changes?

No

* GitHub Issue: apache#40933

Lead-authored-by: Vibhatha Abeykoon <vibhatha@gmail.com>
Co-authored-by: Vibhatha Lakmal Abeykoon <vibhatha@gmail.com>
Signed-off-by: David Li <li.davidm96@gmail.com>
1 parent 28df343 commit 715c0c6

File tree

3 files changed

+243
-8
lines changed

3 files changed

+243
-8
lines changed

java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java

+45-7
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import org.apache.arrow.memory.util.ByteFunctionHelpers;
3434
import org.apache.arrow.memory.util.CommonUtil;
3535
import org.apache.arrow.memory.util.hash.ArrowBufHasher;
36+
import org.apache.arrow.util.Preconditions;
3637
import org.apache.arrow.vector.compare.VectorVisitor;
3738
import org.apache.arrow.vector.ipc.message.ArrowFieldNode;
3839
import org.apache.arrow.vector.types.pojo.Field;
@@ -1334,30 +1335,67 @@ protected final void handleSafe(int index, int dataLength) {
13341335
/**
13351336
* Copy a cell value from a particular index in source vector to a particular position in this
13361337
* vector.
1337-
* TODO: Improve functionality to support copying views.
1338-
* <a href="https://github.com/apache/arrow/issues/40933">Enhance CopyFrom</a>
1339-
*
13401338
* @param fromIndex position to copy from in source vector
13411339
* @param thisIndex position to copy to in this vector
13421340
* @param from source vector
13431341
*/
13441342
@Override
13451343
public void copyFrom(int fromIndex, int thisIndex, ValueVector from) {
1346-
throw new UnsupportedOperationException("copyFrom is not supported for VariableWidthVector");
1344+
Preconditions.checkArgument(getMinorType() == from.getMinorType());
1345+
if (from.isNull(fromIndex)) {
1346+
BitVectorHelper.unsetBit(validityBuffer, thisIndex);
1347+
} else {
1348+
final int viewLength = from.getDataBuffer().getInt((long) fromIndex * ELEMENT_SIZE);
1349+
BitVectorHelper.setBit(validityBuffer, thisIndex);
1350+
final int start = thisIndex * ELEMENT_SIZE;
1351+
final int copyStart = fromIndex * ELEMENT_SIZE;
1352+
from.getDataBuffer().getBytes(start, viewBuffer, copyStart, ELEMENT_SIZE);
1353+
if (viewLength > INLINE_SIZE) {
1354+
final int bufIndex = from.getDataBuffer().getInt(((long) fromIndex * ELEMENT_SIZE) +
1355+
LENGTH_WIDTH + PREFIX_WIDTH);
1356+
final int dataOffset = from.getDataBuffer().getInt(((long) fromIndex * ELEMENT_SIZE) +
1357+
LENGTH_WIDTH + PREFIX_WIDTH + BUF_INDEX_WIDTH);
1358+
final ArrowBuf dataBuf = ((BaseVariableWidthViewVector) from).dataBuffers.get(bufIndex);
1359+
final ArrowBuf thisDataBuf = allocateOrGetLastDataBuffer(viewLength);
1360+
thisDataBuf.setBytes(thisDataBuf.writerIndex(), dataBuf, dataOffset, viewLength);
1361+
thisDataBuf.writerIndex(thisDataBuf.writerIndex() + viewLength);
1362+
}
1363+
}
1364+
lastSet = thisIndex;
13471365
}
13481366

13491367
/**
13501368
* Same as {@link #copyFrom(int, int, ValueVector)} except that it handles the case when the
13511369
* capacity of the vector needs to be expanded before copy.
1352-
* TODO: Improve functionality to support copying views.
1353-
* <a href="https://github.com/apache/arrow/issues/40933">Enhance CopyFrom</a>
13541370
* @param fromIndex position to copy from in source vector
13551371
* @param thisIndex position to copy to in this vector
13561372
* @param from source vector
13571373
*/
13581374
@Override
13591375
public void copyFromSafe(int fromIndex, int thisIndex, ValueVector from) {
1360-
throw new UnsupportedOperationException("copyFromSafe is not supported for VariableWidthVector");
1376+
Preconditions.checkArgument(getMinorType() == from.getMinorType());
1377+
if (from.isNull(fromIndex)) {
1378+
handleSafe(thisIndex, 0);
1379+
BitVectorHelper.unsetBit(validityBuffer, thisIndex);
1380+
} else {
1381+
final int viewLength = from.getDataBuffer().getInt((long) fromIndex * ELEMENT_SIZE);
1382+
handleSafe(thisIndex, viewLength);
1383+
BitVectorHelper.setBit(validityBuffer, thisIndex);
1384+
final int start = thisIndex * ELEMENT_SIZE;
1385+
final int copyStart = fromIndex * ELEMENT_SIZE;
1386+
from.getDataBuffer().getBytes(start, viewBuffer, copyStart, ELEMENT_SIZE);
1387+
if (viewLength > INLINE_SIZE) {
1388+
final int bufIndex = from.getDataBuffer().getInt(((long) fromIndex * ELEMENT_SIZE) +
1389+
LENGTH_WIDTH + PREFIX_WIDTH);
1390+
final int dataOffset = from.getDataBuffer().getInt(((long) fromIndex * ELEMENT_SIZE) +
1391+
LENGTH_WIDTH + PREFIX_WIDTH + BUF_INDEX_WIDTH);
1392+
final ArrowBuf dataBuf = ((BaseVariableWidthViewVector) from).dataBuffers.get(bufIndex);
1393+
final ArrowBuf thisDataBuf = allocateOrGetLastDataBuffer(viewLength);
1394+
thisDataBuf.setBytes(thisDataBuf.writerIndex(), dataBuf, dataOffset, viewLength);
1395+
thisDataBuf.writerIndex(thisDataBuf.writerIndex() + viewLength);
1396+
}
1397+
}
1398+
lastSet = thisIndex;
13611399
}
13621400

13631401
@Override

java/vector/src/main/java/org/apache/arrow/vector/types/Types.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -568,7 +568,7 @@ public FieldWriter getNewFieldWriter(ValueVector vector) {
568568
return new VarBinaryWriterImpl((VarBinaryVector) vector);
569569
}
570570
},
571-
VIEWVARBINARY(Binary.INSTANCE) {
571+
VIEWVARBINARY(BinaryView.INSTANCE) {
572572
@Override
573573
public FieldVector getNewVector(
574574
Field field,

java/vector/src/test/java/org/apache/arrow/vector/TestVarCharViewVector.java

+197
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
import java.util.List;
3737
import java.util.Objects;
3838
import java.util.Random;
39+
import java.util.function.Function;
40+
import java.util.stream.Stream;
3941

4042
import org.apache.arrow.memory.ArrowBuf;
4143
import org.apache.arrow.memory.BufferAllocator;
@@ -52,6 +54,9 @@
5254
import org.junit.jupiter.api.AfterEach;
5355
import org.junit.jupiter.api.BeforeEach;
5456
import org.junit.jupiter.api.Test;
57+
import org.junit.jupiter.params.ParameterizedTest;
58+
import org.junit.jupiter.params.provider.Arguments;
59+
import org.junit.jupiter.params.provider.MethodSource;
5560

5661

5762
public class TestVarCharViewVector {
@@ -1517,6 +1522,198 @@ public void testVectorLoadUnload() {
15171522
}
15181523
}
15191524

1525+
static Stream<Arguments> vectorCreatorProvider() {
1526+
return Stream.of(
1527+
Arguments.of((Function<BufferAllocator, BaseVariableWidthViewVector>)
1528+
(allocator -> newVector(ViewVarBinaryVector.class, EMPTY_SCHEMA_PATH,
1529+
Types.MinorType.VIEWVARBINARY, allocator))),
1530+
Arguments.of((Function<BufferAllocator, BaseVariableWidthViewVector>)
1531+
(allocator -> newVector(ViewVarCharVector.class, EMPTY_SCHEMA_PATH,
1532+
Types.MinorType.VIEWVARCHAR, allocator)))
1533+
);
1534+
}
1535+
1536+
@ParameterizedTest
1537+
@MethodSource({"vectorCreatorProvider"})
1538+
public void testCopyFromWithNulls(Function<BufferAllocator, BaseVariableWidthViewVector> vectorCreator) {
1539+
try (final BaseVariableWidthViewVector vector = vectorCreator.apply(allocator);
1540+
final BaseVariableWidthViewVector vector2 = vectorCreator.apply(allocator)) {
1541+
final int initialCapacity = 1024;
1542+
vector.setInitialCapacity(initialCapacity);
1543+
vector.allocateNew();
1544+
int capacity = vector.getValueCapacity();
1545+
assertTrue(capacity >= initialCapacity);
1546+
1547+
// setting number of values such that we have enough space in the initial allocation
1548+
// to avoid re-allocation. This is to test copyFrom() without re-allocation.
1549+
final int numberOfValues = initialCapacity / 2 / ViewVarCharVector.ELEMENT_SIZE;
1550+
1551+
final String prefixString = generateRandomString(12);
1552+
1553+
for (int i = 0; i < numberOfValues; i++) {
1554+
if (i % 3 == 0) {
1555+
// null values
1556+
vector.setNull(i);
1557+
} else if (i % 3 == 1) {
1558+
// short strings
1559+
byte[] b = Integer.toString(i).getBytes(StandardCharsets.UTF_8);
1560+
vector.set(i, b, 0, b.length);
1561+
} else {
1562+
// long strings
1563+
byte[] b = (i + prefixString).getBytes(StandardCharsets.UTF_8);
1564+
vector.set(i, b, 0, b.length);
1565+
}
1566+
}
1567+
1568+
assertEquals(capacity, vector.getValueCapacity());
1569+
1570+
vector.setValueCount(numberOfValues);
1571+
1572+
for (int i = 0; i < numberOfValues; i++) {
1573+
if (i % 3 == 0) {
1574+
assertNull(vector.getObject(i));
1575+
} else if (i % 3 == 1) {
1576+
assertArrayEquals(Integer.toString(i).getBytes(StandardCharsets.UTF_8),
1577+
vector.get(i),
1578+
"unexpected value at index: " + i);
1579+
} else {
1580+
assertArrayEquals((i + prefixString).getBytes(StandardCharsets.UTF_8),
1581+
vector.get(i),
1582+
"unexpected value at index: " + i);
1583+
}
1584+
}
1585+
1586+
vector2.setInitialCapacity(initialCapacity);
1587+
vector2.allocateNew();
1588+
int capacity2 = vector2.getValueCapacity();
1589+
assertEquals(capacity2, capacity);
1590+
1591+
for (int i = 0; i < numberOfValues; i++) {
1592+
vector2.copyFrom(i, i, vector);
1593+
if (i % 3 == 0) {
1594+
assertNull(vector2.getObject(i));
1595+
} else if (i % 3 == 1) {
1596+
assertArrayEquals(Integer.toString(i).getBytes(StandardCharsets.UTF_8),
1597+
vector.get(i),
1598+
"unexpected value at index: " + i);
1599+
} else {
1600+
assertArrayEquals((i + prefixString).getBytes(StandardCharsets.UTF_8),
1601+
vector.get(i),
1602+
"unexpected value at index: " + i);
1603+
}
1604+
}
1605+
1606+
assertEquals(capacity, vector2.getValueCapacity());
1607+
1608+
vector2.setValueCount(numberOfValues);
1609+
1610+
for (int i = 0; i < numberOfValues; i++) {
1611+
if (i % 3 == 0) {
1612+
assertNull(vector2.getObject(i));
1613+
} else if (i % 3 == 1) {
1614+
assertArrayEquals(Integer.toString(i).getBytes(StandardCharsets.UTF_8),
1615+
vector.get(i),
1616+
"unexpected value at index: " + i);
1617+
} else {
1618+
assertArrayEquals((i + prefixString).getBytes(StandardCharsets.UTF_8),
1619+
vector.get(i),
1620+
"unexpected value at index: " + i);
1621+
}
1622+
}
1623+
}
1624+
}
1625+
1626+
@ParameterizedTest
1627+
@MethodSource("vectorCreatorProvider")
1628+
public void testCopyFromSafeWithNulls(Function<BufferAllocator, BaseVariableWidthViewVector> vectorCreator) {
1629+
try (final BaseVariableWidthViewVector vector = vectorCreator.apply(allocator);
1630+
final BaseVariableWidthViewVector vector2 = vectorCreator.apply(allocator)) {
1631+
1632+
final int initialCapacity = 4096;
1633+
vector.setInitialCapacity(initialCapacity);
1634+
vector.allocateNew();
1635+
int capacity = vector.getValueCapacity();
1636+
assertTrue(capacity >= initialCapacity);
1637+
1638+
final int numberOfValues = initialCapacity / ViewVarCharVector.ELEMENT_SIZE;
1639+
1640+
final String prefixString = generateRandomString(12);
1641+
1642+
for (int i = 0; i < numberOfValues; i++) {
1643+
if (i % 3 == 0) {
1644+
// null values
1645+
vector.setNull(i);
1646+
} else if (i % 3 == 1) {
1647+
// short strings
1648+
byte[] b = Integer.toString(i).getBytes(StandardCharsets.UTF_8);
1649+
vector.setSafe(i, b, 0, b.length);
1650+
} else {
1651+
// long strings
1652+
byte[] b = (i + prefixString).getBytes(StandardCharsets.UTF_8);
1653+
vector.setSafe(i, b, 0, b.length);
1654+
}
1655+
}
1656+
1657+
/* NO reAlloc() should have happened in setSafe() */
1658+
assertEquals(capacity, vector.getValueCapacity());
1659+
1660+
vector.setValueCount(numberOfValues);
1661+
1662+
for (int i = 0; i < numberOfValues; i++) {
1663+
if (i % 3 == 0) {
1664+
assertNull(vector.getObject(i));
1665+
} else if (i % 3 == 1) {
1666+
assertArrayEquals(Integer.toString(i).getBytes(StandardCharsets.UTF_8),
1667+
vector.get(i),
1668+
"unexpected value at index: " + i);
1669+
} else {
1670+
assertArrayEquals((i + prefixString).getBytes(StandardCharsets.UTF_8),
1671+
vector.get(i),
1672+
"unexpected value at index: " + i);
1673+
}
1674+
}
1675+
1676+
vector2.setInitialCapacity(initialCapacity);
1677+
vector2.allocateNew();
1678+
int capacity2 = vector2.getValueCapacity();
1679+
assertEquals(capacity2, capacity);
1680+
1681+
for (int i = 0; i < numberOfValues; i++) {
1682+
vector2.copyFromSafe(i, i, vector);
1683+
if (i % 3 == 0) {
1684+
assertNull(vector2.getObject(i));
1685+
} else if (i % 3 == 1) {
1686+
assertArrayEquals(Integer.toString(i).getBytes(StandardCharsets.UTF_8),
1687+
vector.get(i),
1688+
"unexpected value at index: " + i);
1689+
} else {
1690+
assertArrayEquals((i + prefixString).getBytes(StandardCharsets.UTF_8),
1691+
vector.get(i),
1692+
"unexpected value at index: " + i);
1693+
}
1694+
}
1695+
1696+
/* NO reAlloc() should have happened in setSafe() */
1697+
assertEquals(capacity, vector2.getValueCapacity());
1698+
1699+
vector2.setValueCount(numberOfValues);
1700+
1701+
for (int i = 0; i < numberOfValues; i++) {
1702+
if (i % 3 == 0) {
1703+
assertNull(vector2.getObject(i));
1704+
} else if (i % 3 == 1) {
1705+
assertArrayEquals(Integer.toString(i).getBytes(StandardCharsets.UTF_8),
1706+
vector.get(i),
1707+
"unexpected value at index: " + i);
1708+
} else {
1709+
assertArrayEquals((i + prefixString).getBytes(StandardCharsets.UTF_8),
1710+
vector.get(i),
1711+
"unexpected value at index: " + i);
1712+
}
1713+
}
1714+
}
1715+
}
1716+
15201717
private String generateRandomString(int length) {
15211718
Random random = new Random();
15221719
StringBuilder sb = new StringBuilder(length);

0 commit comments

Comments
 (0)